Reading Data

# Switch to the folder that holds the data set, then load the CSV into MBA.
# NOTE(review): the absolute path is machine-specific; the later relative
# read.csv() call in this file also depends on this working directory.
setwd(dir = "C:/Users/harsh/Desktop/r")
MBA <- read.csv(file = "MBA Starting Salaries Data.csv")

Summarizing

# Put the columns of MBA on the search path so that later statements can use
# bare column names (age, salary, satis, ...), then print per-column
# descriptive statistics with describe() from the psych package.
attach(what = MBA)
library("psych")
describe(x = MBA)
##          vars   n     mean       sd median  trimmed     mad min    max
## age         1 274    27.36     3.71     27    26.76    2.97  22     48
## sex         2 274     1.25     0.43      1     1.19    0.00   1      2
## gmat_tot    3 274   619.45    57.54    620   618.86   59.30 450    790
## gmat_qpc    4 274    80.64    14.87     83    82.31   14.83  28     99
## gmat_vpc    5 274    78.32    16.86     81    80.33   14.83  16     99
## gmat_tpc    6 274    84.20    14.02     87    86.12   11.86   0     99
## s_avg       7 274     3.03     0.38      3     3.03    0.44   2      4
## f_avg       8 274     3.06     0.53      3     3.09    0.37   0      4
## quarter     9 274     2.48     1.11      2     2.47    1.48   1      4
## work_yrs   10 274     3.87     3.23      3     3.29    1.48   0     22
## frstlang   11 274     1.12     0.32      1     1.02    0.00   1      2
## salary     12 274 39025.69 50951.56    999 33607.86 1481.12   0 220000
## satis      13 274   172.18   371.61      6    91.50    1.48   1    998
##           range  skew kurtosis      se
## age          26  2.16     6.45    0.22
## sex           1  1.16    -0.66    0.03
## gmat_tot    340 -0.01     0.06    3.48
## gmat_qpc     71 -0.92     0.30    0.90
## gmat_vpc     83 -1.04     0.74    1.02
## gmat_tpc     99 -2.28     9.02    0.85
## s_avg         2 -0.06    -0.38    0.02
## f_avg         4 -2.08    10.85    0.03
## quarter       3  0.02    -1.35    0.07
## work_yrs     22  2.78     9.80    0.20
## frstlang      1  2.37     3.65    0.02
## salary   220000  0.70    -1.05 3078.10
## satis       997  1.77     1.13   22.45

Histograms of various parameters

# Analysis subsets, built from explicit MBA columns instead of the bare names
# exposed by attach(MBA):
#   group1 - rows whose satis value is not the code 998
#   group2 - rows whose salary is above 999 (used below as the placed students)
# which() keeps only the rows where the comparison is TRUE, so NA values can
# never introduce all-NA rows into the subsets.
group1 <- MBA[which(MBA$satis != 998), ]
group2 <- MBA[which(MBA$salary > 999), ]
# Histograms of the main variables. Each column is taken from its data frame
# explicitly rather than through attach(MBA), so every plot states its source:
# MBA (all students) for most variables, group2 for salary and group1 for
# satisfaction.
hist(MBA$age, breaks = 18, col = "skyblue", xlab = "Age", main = "AGE")

hist(MBA$sex, breaks = 18, col = "lightgreen", xlab = "1 : Male 
    2 : Female", main = "Sex")

hist(MBA$gmat_tot, breaks = 18, col = "skyblue",
     xlab = "Gmat Total", main = "Gmat Total")

hist(MBA$gmat_qpc, breaks = 18, col = "lightgreen",
     xlab = "Gmat qpc", main = "Quantitative GMAT Percentile")

hist(MBA$gmat_vpc, breaks = 18, col = "gray",
     xlab = "Gmat Vpc", main = "Gmat Verbal Percentile")

hist(MBA$gmat_tpc, breaks = 18, col = "Yellow",
     xlab = "Gmat tpc", main = "Gmat Overall Percentile")

hist(group2$salary, breaks = 18, col = "gray",
     xlab = "Salary", main = "Salary of placed students")

hist(group1$satis, breaks = 18, col = "lightgreen",
     xlab = "Satisfaction", main = "Satisfaction")

hist(MBA$work_yrs, breaks = 18, col = "skyblue",
     xlab = "Work Years", main = "Work Experience in Years")

Scatterplots

library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
# Salary plotted against each candidate predictor for the rows in group2.
# Every call passes the same settings (pch = 16, spread = FALSE and
# lty = 2 via smoother.args); only the predictor, title and x-axis label vary.
scatterplot(
  salary ~ age,
  data = group2, spread = FALSE, smoother.args = list(lty = 2), pch = 16,
  main = "Scatterplot of Salary vs Age",
  xlab = "Age", ylab = "Salary"
)

scatterplot(
  salary ~ sex,
  data = group2, spread = FALSE, smoother.args = list(lty = 2), pch = 16,
  main = "Scatterplot of Salary vs Sex",
  xlab = "Sex", ylab = "Salary"
)

scatterplot(
  salary ~ gmat_tpc,
  data = group2, spread = FALSE, smoother.args = list(lty = 2), pch = 16,
  main = "Scatterplot of Salary vs Overall GMAT Percentile ",
  xlab = "gmat percentile", ylab = "Salary"
)

scatterplot(
  salary ~ s_avg,
  data = group2, spread = FALSE, smoother.args = list(lty = 2), pch = 16,
  main = "Scatterplot of Salary vs Spring MBA AVerage",
  xlab = "Spring MBA AVerage", ylab = "Salary"
)

scatterplot(
  salary ~ f_avg,
  data = group2, spread = FALSE, smoother.args = list(lty = 2), pch = 16,
  main = "Scatterplot of Salary vs Fall MBA AVerage",
  xlab = "Fall MBA AVerage", ylab = "Salary"
)

scatterplot(
  salary ~ work_yrs,
  data = group2, spread = FALSE, smoother.args = list(lty = 2), pch = 16,
  main = "Scatterplot of Salary vs Work Experience",
  xlab = "Work Experience", ylab = "Salary"
)

scatterplot(
  salary ~ frstlang,
  data = group2, spread = FALSE, smoother.args = list(lty = 2), pch = 16,
  main = "Scatterplot of Salary vs First Language",
  xlab = "First Language ( 1= English
                        2= Others)", ylab = "Salary"
)

Corrgram, Creating a Variance-Covariance Matrix

# Correlogram of every column in MBA: panel.shade below the diagonal,
# panel.pie above it, panel.txt on the diagonal, variables reordered by
# corrgram (order = TRUE).
# NOTE(review): MBA still holds the codes 998/999 in salary and satis (see the
# describe() output above) — confirm they should enter these correlations.
library("corrgram")
corrgram(
  MBA,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Corrgram of MBA Variables"
)

# Variance-covariance matrix of the numeric variables for the rows in group2.
# The column list is written once so the two operands cannot drift apart;
# x and y are both kept because they hold the same columns in the same order,
# which makes cov(x, y) the full covariance matrix of those columns.
cov_vars <- c("age", "gmat_tot", "gmat_qpc", "gmat_vpc", "gmat_tpc",
              "s_avg", "f_avg", "work_yrs", "salary", "sex", "frstlang")
x <- group2[, cov_vars]
y <- x
cov(x, y)
##                    age      gmat_tot      gmat_qpc      gmat_vpc
## age         10.7045498 -1.305445e+01   -7.22796497  9.505045e-01
## gmat_tot   -13.0544451  2.569294e+03  452.14258519  6.386360e+02
## gmat_qpc    -7.2279650  4.521426e+02  179.18027794  2.045850e+01
## gmat_vpc     0.9505045  6.386360e+02   20.45849990  2.606602e+02
## gmat_tpc    -3.4602132  5.393623e+02   97.03607462  1.393882e+02
## s_avg        0.1938587  3.299562e+00    0.07838473  9.694594e-01
## f_avg       -0.3462517  3.027432e+00    0.64252142  1.803303e-01
## work_yrs     8.6728536 -1.873882e+01   -7.36245955 -1.366838e+00
## salary   29210.5193223 -8.212449e+04 3382.43784504 -3.964803e+04
## sex         -0.2164477 -4.568818e-01   -0.90757662  3.974872e-01
## frstlang     0.2898344 -1.687607e+00    0.04806777 -8.915858e-01
##               gmat_tpc        s_avg         f_avg      work_yrs
## age      -3.460213e+00   0.19385875 -3.462517e-01     8.6728536
## gmat_tot  5.393623e+02   3.29956215  3.027432e+00   -18.7388159
## gmat_qpc  9.703607e+01   0.07838473  6.425214e-01    -7.3624595
## gmat_vpc  1.393882e+02   0.96945936  1.803303e-01    -1.3668380
## gmat_tpc  1.211342e+02   0.58062916  3.785056e-01    -4.3892062
## s_avg     5.806292e-01   0.14325138  8.231046e-02     0.1860480
## f_avg     3.785056e-01   0.08231046  2.378638e-01    -0.3176271
## work_yrs -4.389206e+00   0.18604797 -3.176271e-01     9.0630116
## salary   -2.596339e+04 688.02042071 -9.241129e+02 24458.1995050
## sex      -2.377689e-01   0.01409575  3.725395e-02    -0.1281173
## frstlang -4.575481e-01  -0.01319912 -6.243099e-03     0.1494384
##                 salary           sex      frstlang
## age       2.921052e+04 -2.164477e-01  2.898344e-01
## gmat_tot -8.212449e+04 -4.568818e-01 -1.687607e+00
## gmat_qpc  3.382438e+03 -9.075766e-01  4.806777e-02
## gmat_vpc -3.964803e+04  3.974872e-01 -8.915858e-01
## gmat_tpc -2.596339e+04 -2.377689e-01 -4.575481e-01
## s_avg     6.880204e+02  1.409575e-02 -1.319912e-02
## f_avg    -9.241129e+02  3.725395e-02 -6.243099e-03
## work_yrs  2.445820e+04 -1.281173e-01  1.494384e-01
## salary    3.192940e+08 -1.369577e+03  1.206714e+03
## sex      -1.369577e+03  2.124500e-01  8.756901e-03
## frstlang  1.206714e+03  8.756901e-03  6.396345e-02

Corrgram of the People who were placed and who answered the survey and disclosed their salaries.

# Same correlogram layout as for the full data set, restricted to group2.
corrgram(
  group2,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Corrgram of MBA Variables"
)

Creating Contingency Tables

# Two-way contingency tables for the rows in group2: each one cross-tabulates
# a single variable (rows) against salary (columns).
table1 <- xtabs(~ age + salary, data = group2)
table2 <- xtabs(~ sex + salary, data = group2)
table3 <- xtabs(~ work_yrs + salary, data = group2)
table4 <- xtabs(~ gmat_tpc + salary, data = group2)
table5 <- xtabs(~ frstlang + salary, data = group2)
table6 <- xtabs(~ quarter + salary, data = group2)
table7 <- xtabs(~ s_avg + salary, data = group2)
table8 <- xtabs(~ f_avg + salary, data = group2)
table9 <- xtabs(~ satis + salary, data = group2)

Chi-Square Tests

# Pearson chi-squared test of independence on table1 (age x salary, group2).
# NOTE(review): R warns that the chi-squared approximation may be incorrect
# for this table — confirm whether simulate.p.value = TRUE or coarser salary
# bins are needed before relying on the p-value.
chisq.test(table1)
## Warning in chisq.test(table1): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table1
## X-squared = 717.62, df = 574, p-value = 3.929e-05
# Pearson chi-squared test of independence on table2 (sex x salary, group2).
# NOTE(review): R warns that the chi-squared approximation may be incorrect
# for this table — treat the p-value with caution.
chisq.test(table2)
## Warning in chisq.test(table2): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table2
## X-squared = 52.681, df = 41, p-value = 0.1045
# Pearson chi-squared test of independence on table3 (work_yrs x salary,
# group2).
# NOTE(review): R warns that the chi-squared approximation may be incorrect
# for this table — treat the p-value with caution.
chisq.test(table3)
## Warning in chisq.test(table3): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table3
## X-squared = 535.23, df = 451, p-value = 0.003809
# Pearson chi-squared test of independence on table4 (gmat_tpc x salary,
# group2).
# NOTE(review): R warns that the chi-squared approximation may be incorrect
# for this table — treat the p-value with caution.
chisq.test(table4)
## Warning in chisq.test(table4): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table4
## X-squared = 1422.2, df = 1230, p-value = 0.0001065
# Pearson chi-squared test of independence on table5 (frstlang x salary,
# group2).
# NOTE(review): R warns that the chi-squared approximation may be incorrect
# for this table — treat the p-value with caution.
chisq.test(table5)
## Warning in chisq.test(table5): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table5
## X-squared = 69.847, df = 41, p-value = 0.003296
# Pearson chi-squared test of independence on table6 (quarter x salary,
# group2).
# NOTE(review): R warns that the chi-squared approximation may be incorrect
# for this table — treat the p-value with caution.
chisq.test(table6)
## Warning in chisq.test(table6): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table6
## X-squared = 129.85, df = 123, p-value = 0.3186
# Pearson chi-squared test of independence on table7 (s_avg x salary, group2).
# NOTE(review): R warns that the chi-squared approximation may be incorrect
# for this table — treat the p-value with caution.
chisq.test(table7)
## Warning in chisq.test(table7): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table7
## X-squared = 792.97, df = 861, p-value = 0.9524
# Pearson chi-squared test of independence on table8 (f_avg x salary, group2).
# NOTE(review): R warns that the chi-squared approximation may be incorrect
# for this table — treat the p-value with caution.
chisq.test(table8)
## Warning in chisq.test(table8): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table8
## X-squared = 596.28, df = 574, p-value = 0.2518
# Pearson chi-squared test of independence on table9 (satis x salary, group2).
# NOTE(review): R warns that the chi-squared approximation may be incorrect
# for this table — treat the p-value with caution.
chisq.test(table9)
## Warning in chisq.test(table9): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table9
## X-squared = 109.1, df = 164, p-value = 0.9997

Linear Regression

# Baseline model: salary regressed on all eleven candidate predictors for the
# rows in group2; summary() prints the coefficient table and fit statistics.
model1 <- lm(
  salary ~ age + sex + gmat_tpc + gmat_vpc + gmat_qpc + gmat_tot +
    s_avg + f_avg + quarter + work_yrs + frstlang,
  data = group2
)
summary(model1)
## 
## Call:
## lm(formula = salary ~ age + sex + gmat_tpc + gmat_vpc + gmat_qpc + 
##     gmat_tot + s_avg + f_avg + quarter + work_yrs + frstlang, 
##     data = group2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -25773  -7903   -609   5617  70568 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 70768.71   50785.86   1.393   0.1669  
## age          1718.73    1124.50   1.528   0.1299  
## sex         -3423.13    3566.90  -0.960   0.3398  
## gmat_tpc    -1502.62     706.28  -2.127   0.0361 *
## gmat_vpc      506.10     493.54   1.025   0.3079  
## gmat_qpc      769.13     491.76   1.564   0.1213  
## gmat_tot       39.19     172.21   0.228   0.8205  
## s_avg       -1518.43    8123.94  -0.187   0.8521  
## f_avg       -2363.96    3868.50  -0.611   0.5427  
## quarter     -2702.70    2612.19  -1.035   0.3036  
## work_yrs      758.24    1131.11   0.670   0.5043  
## frstlang     7303.97    7296.86   1.001   0.3195  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15360 on 91 degrees of freedom
## Multiple R-squared:  0.3403, Adjusted R-squared:  0.2606 
## F-statistic: 4.268 on 11 and 91 DF,  p-value: 4.112e-05

Iteration 1 (removing the two variables with p-value > 0.80, gmat_tot and s_avg, and adding satis)

# Second model: compared with model1 the formula omits gmat_tot and s_avg and
# includes satis as an additional predictor.
model2 <- lm(
  salary ~ age + sex + gmat_tpc + gmat_vpc + gmat_qpc +
    f_avg + quarter + work_yrs + frstlang + satis,
  data = group2
)
summary(model2)
## 
## Call:
## lm(formula = salary ~ age + sex + gmat_tpc + gmat_vpc + gmat_qpc + 
##     f_avg + quarter + work_yrs + frstlang + satis, data = group2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -26345  -7968   -321   6064  70799 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  79216.9    32739.0   2.420   0.0175 *
## age           1741.3     1113.0   1.565   0.1211  
## sex          -3612.3     3514.6  -1.028   0.3067  
## gmat_tpc     -1448.6      695.3  -2.084   0.0400 *
## gmat_vpc       579.0      357.2   1.621   0.1085  
## gmat_qpc       827.8      357.9   2.313   0.0229 *
## f_avg        -2228.7     3696.2  -0.603   0.5480  
## quarter      -2085.0     1641.9  -1.270   0.2073  
## work_yrs       749.7     1123.3   0.667   0.5062  
## frstlang      7918.3     7129.5   1.111   0.2696  
## satis        -1150.4     2057.9  -0.559   0.5775  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15260 on 92 degrees of freedom
## Multiple R-squared:  0.3421, Adjusted R-squared:  0.2706 
## F-statistic: 4.783 on 10 and 92 DF,  p-value: 1.616e-05

Iteration 2 (removing work_yrs and satis, two of the variables with p-value > 0.40; we get the following results)

# Third model: the model2 formula without work_yrs and satis.
model3 <- lm(
  salary ~ age + sex + gmat_tpc + gmat_vpc + gmat_qpc +
    f_avg + quarter + frstlang,
  data = group2
)
summary(model3)
## 
## Call:
## lm(formula = salary ~ age + sex + gmat_tpc + gmat_vpc + gmat_qpc + 
##     f_avg + quarter + frstlang, data = group2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -24208  -7290   -967   5190  71732 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  63049.8    26242.1   2.403   0.0182 *  
## age           2356.5      545.7   4.319 3.89e-05 ***
## sex          -3109.8     3444.9  -0.903   0.3690    
## gmat_tpc     -1460.8      687.2  -2.126   0.0361 *  
## gmat_vpc       561.9      353.2   1.591   0.1150    
## gmat_qpc       834.9      353.5   2.361   0.0203 *  
## f_avg        -2485.8     3660.2  -0.679   0.4987    
## quarter      -2347.8     1580.4  -1.486   0.1407    
## frstlang      6265.2     6771.5   0.925   0.3572    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15160 on 94 degrees of freedom
## Multiple R-squared:  0.3366, Adjusted R-squared:  0.2801 
## F-statistic: 5.961 on 8 and 94 DF,  p-value: 3.569e-06

Iteration 3 (removing the variables with p-value > 0.30: sex, f_avg and frstlang)

# Fourth model: the model3 formula without sex, f_avg and frstlang, leaving
# five predictors.
model4 <- lm(
  salary ~ age + gmat_tpc + gmat_vpc + gmat_qpc + quarter,
  data = group2
)
summary(model4)
## 
## Call:
## lm(formula = salary ~ age + gmat_tpc + gmat_vpc + gmat_qpc + 
##     quarter, data = group2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -26838  -7493   -245   5380  69938 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  46396.9    19354.6   2.397   0.0184 *  
## age           2721.0      467.6   5.819 7.62e-08 ***
## gmat_tpc     -1444.4      680.9  -2.121   0.0365 *  
## gmat_vpc       529.1      350.3   1.511   0.1342    
## gmat_qpc       852.2      348.1   2.448   0.0162 *  
## quarter      -1611.9     1358.9  -1.186   0.2384    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15080 on 97 degrees of freedom
## Multiple R-squared:  0.3227, Adjusted R-squared:  0.2878 
## F-statistic: 9.243 on 5 and 97 DF,  p-value: 3.202e-07

Now we have achieved the model with the highest adjusted R-squared value among the four fitted models. Thus, a regression model with an adjusted R-squared of 0.2878 (about 28.8%; multiple R-squared 0.3227) is derived for the purpose of salary prediction.

Comparison of two groups

Now we will consider two different groups from the given data set: one with students who were placed and the other with students who were not placed.

# Re-read the data with blank fields and the codes 998/999 converted to NA in
# every column.
mba <- read.csv(
  "MBA Starting Salaries Data.csv",
  header = TRUE,
  na.strings = c("", " ", "999", "998", "NA")
)
# Placed is TRUE when salary is above 1000, FALSE otherwise, and NA when
# salary is NA.
# NOTE(review): group2 earlier in this file uses salary > 999 — confirm that
# both thresholds are meant to select the same students.
mba$Placed <- mba$salary > 1000
# Cell proportions of Placed by sex, with row and column sums, rounded to two
# decimals.
mytab <- xtabs(~ Placed + sex, data = mba)
round(ftable(addmargins(prop.table(mytab))), 2)
##        sex    1    2  Sum
## Placed                   
## FALSE      0.35 0.12 0.47
## TRUE       0.37 0.16 0.53
## Sum        0.72 0.28 1.00
# Cell proportions of Placed by first language, with row and column sums,
# rounded to two decimals.
mytab2 <- xtabs(~ Placed + frstlang, data = mba)
lang_props <- addmargins(prop.table(mytab2))
round(ftable(lang_props), 2)
##        frstlang    1    2  Sum
## Placed                        
## FALSE           0.42 0.04 0.47
## TRUE            0.50 0.04 0.53
## Sum             0.92 0.08 1.00

Hypothesis 1, H1: Number of Females is more than Number of Males in placed students

# Chi-squared test of independence between Placed and sex on the 2 x 2 table
# mytab; the output shows Yates' continuity correction being applied.
# NOTE(review): this tests for any association, not for the direction stated
# in the hypothesis — confirm that is the intended test.
chisq.test(mytab)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  mytab
## X-squared = 0.29208, df = 1, p-value = 0.5889

Hypothesis 2, H2: Among placed students, those whose first language is English outnumber those whose first language is not English

# Chi-squared test of independence between Placed and frstlang on the 2 x 2
# table mytab2; the output shows Yates' continuity correction being applied.
# NOTE(review): this tests for any association, not for the direction stated
# in the hypothesis — confirm that is the intended test.
chisq.test(mytab2)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  mytab2
## X-squared = 0.074127, df = 1, p-value = 0.7854

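So, from the proportion tables: among placed students, males are in a higher proportion than females, and students with English as their first language are in a higher proportion than students with another first language. Neither chi-squared test is significant (p = 0.5889 and p = 0.7854), so placement status shows no significant association with sex or with first language.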