# Read the MBA starting-salary data set from the working directory.
# The file name is a single literal, so it is passed to read.csv() directly.
start_salary <- read.csv("MBA Starting Salaries Data.csv")
library(psych)
# Per-column summary statistics (n, mean, sd, median, ...) for every column.
describe(start_salary)
##          vars   n     mean       sd median  trimmed     mad min    max
## age         1 274    27.36     3.71     27    26.76    2.97  22     48
## sex         2 274     1.25     0.43      1     1.19    0.00   1      2
## gmat_tot    3 274   619.45    57.54    620   618.86   59.30 450    790
## gmat_qpc    4 274    80.64    14.87     83    82.31   14.83  28     99
## gmat_vpc    5 274    78.32    16.86     81    80.33   14.83  16     99
## gmat_tpc    6 274    84.20    14.02     87    86.12   11.86   0     99
## s_avg       7 274     3.03     0.38      3     3.03    0.44   2      4
## f_avg       8 274     3.06     0.53      3     3.09    0.37   0      4
## quarter     9 274     2.48     1.11      2     2.47    1.48   1      4
## work_yrs   10 274     3.87     3.23      3     3.29    1.48   0     22
## frstlang   11 274     1.12     0.32      1     1.02    0.00   1      2
## salary     12 274 39025.69 50951.56    999 33607.86 1481.12   0 220000
## satis      13 274   172.18   371.61      6    91.50    1.48   1    998
##           range  skew kurtosis      se
## age          26  2.16     6.45    0.22
## sex           1  1.16    -0.66    0.03
## gmat_tot    340 -0.01     0.06    3.48
## gmat_qpc     71 -0.92     0.30    0.90
## gmat_vpc     83 -1.04     0.74    1.02
## gmat_tpc     99 -2.28     9.02    0.85
## s_avg         2 -0.06    -0.38    0.02
## f_avg         4 -2.08    10.85    0.03
## quarter       3  0.02    -1.35    0.07
## work_yrs     22  2.78     9.80    0.20
## frstlang      1  2.37     3.65    0.02
## salary   220000  0.70    -1.05 3078.10
## satis       997  1.77     1.13   22.45
# Partition the respondents by the value stored in `salary`.
# subset() keeps the rows where the condition is TRUE (NA results are
# dropped) and preserves the original row names. Each subset is opened in
# the data viewer for inspection.

# salary above 999: an actual salary figure was reported.
Placed <- subset(start_salary, salary > 999)
View(Placed)

# salary exactly 0.
Not_Placed <- subset(start_salary, salary == 0)
View(Not_Placed)

# salary exactly 999 (placeholder code rather than an amount).
Salary_not_disclosed <- subset(start_salary, salary == 999)
View(Salary_not_disclosed)

# salary exactly 998 (placeholder code rather than an amount).
Not_answered <- subset(start_salary, salary == 998)
View(Not_answered)

# Rows whose salary is either 0 or a real figure, i.e. neither placeholder.
Salary_known <- subset(start_salary, salary == 0 | salary > 999)
View(Salary_known)

# Rows whose satisfaction score is below 8; this excludes the large
# placeholder values visible in the summary (max 998).
Satis_known <- subset(start_salary, satis < 8)
View(Satis_known)

##Boxplot

# Horizontal box plot of respondent age across the full data set.
boxplot(
  start_salary[["age"]],
  col = "tomato",
  xlab = "Age",
  main = "Box Plot for Age",
  horizontal = TRUE
)

# Bar plot of the number of respondents in each `sex` category.
# The bar labels assume the two categories sort as Male, Female.
counts_sex <- table(start_salary$sex)
# The y-axis limit is derived from the counts so the tallest bar always fits:
# a fixed limit of 105 is lower than the largest category here (mean sex of
# 1.25 over 274 rows puts roughly 200 rows in the first category).
barplot(
  counts_sex,
  width = 1,
  space = 1,
  main = "Gender Distribution",
  xlab = "Gender",
  col = c("darkblue", "pink"),
  names.arg = c("Male", "Female"),
  ylim = c(0, max(pretty(c(0, counts_sex)))),
  xlim = c(0, 10)
)

# Horizontal box plots for the GMAT, grade-average and work-experience
# columns of the full data set. Each entry gives the column to plot, the plot
# title, the x-axis label and the fill colour; the plots are drawn in the
# order listed. local() keeps the helper names out of the global workspace.
local({
  box_specs <- list(
    list("gmat_tot", "Box Plot for GMAT Total", "Score", "blue"),
    list("gmat_qpc", "Box Plot for Quantitative GMAT Percentile", "Percentile", "red"),
    list("gmat_vpc", "Box Plot for Verbal GMAT Percentile ", "Percentile", "green"),
    list("gmat_tpc", "Box Plot for Overall GMAT Percentile ", "Percentile", "yellow"),
    list("s_avg", "Box Plot for Spring MBA Average ", "Average", "navy"),
    list("f_avg", "Box Plot for Fall MBA Average ", "Average", "green"),
    list("work_yrs", "Box Plot for Years of Work Experience ", "Years", "pink")
  )
  for (spec in box_specs) {
    boxplot(
      start_salary[[spec[[1]]]],
      horizontal = TRUE,
      main = spec[[2]],
      xlab = spec[[3]],
      col = spec[[4]]
    )
  }
})

# Bar plot of the number of respondents in each `frstlang` category.
# The bar labels assume the two categories sort as English, Other.
counts_lang <- table(start_salary$frstlang)
# The y-axis limit is derived from the counts so the tallest bar always fits:
# a fixed limit of 105 is lower than the largest category here (mean frstlang
# of 1.12 over 274 rows puts roughly 240 rows in the first category).
barplot(
  counts_lang,
  width = 1,
  space = 1,
  main = "First Language of Students ",
  xlab = "Language",
  col = c("purple", "darkgreen"),
  names.arg = c("English", "Other"),
  ylim = c(0, max(pretty(c(0, counts_lang)))),
  xlim = c(0, 10)
)

# Horizontal box plot of salary, restricted to the rows in Salary_known
# (salary of 0 or a real figure; placeholder codes are excluded).
boxplot(
  Salary_known[["salary"]],
  col = "blue",
  xlab = "Salary ",
  main = "Box Plot for Starting Salary ",
  horizontal = TRUE
)

##Histogram

# Histogram of the satisfaction score for the rows in Satis_known
# (scores below 8 only), with about five suggested bins.
hist(
  Satis_known[["satis"]],
  breaks = 5,
  col = "orange",
  main = "Satisfaction  distribution",
  xlab = "Satisfaction level,1=low 7=high"
)

##Scatter plot

library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
# Scatterplot matrix of salary against age, GMAT total, the two grade
# averages and work experience, with points grouped by `sex`.
# Salary_known is used rather than the full data set so that the 998/999
# placeholder codes in `salary` are not plotted as if they were real salaries.
scatterplotMatrix(
  ~ salary + age + gmat_tot + s_avg + f_avg + work_yrs | sex,
  data = Salary_known,
  main = "Variation of Salary with other variables"
)

##Corrgram

library(corrgram)
# Correlogram over the columns of Salary_known: shaded cells below the
# diagonal, pie glyphs above it, variable names on the diagonal. order = TRUE
# lets corrgram reorder the variables.
corrgram(
  Salary_known,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "MBA starting salary analysis Corrgram"
)

##Corrplot

library(corrplot)
## corrplot 0.84 loaded
# Correlation plot for a chosen set of numeric columns of Salary_known.
# Step 1: keep only the columns of interest, in this order.
corr <- subset(
  Salary_known,
  select = c(
    age, work_yrs, gmat_tot, gmat_qpc, gmat_vpc,
    gmat_tpc, s_avg, f_avg, quarter, satis
  )
)
# Step 2: pairwise correlation matrix of those columns.
corr_final <- cor(corr)
# Step 3: draw the matrix, one circle per pair.
corrplot(corr_final, method = "circle")

##Pearson Chi-square Test

# Pearson chi-square test of independence between salary and sex.
# The contingency table has one row per distinct salary value.
# NOTE(review): with so many salary levels the table is sparse, which is
# presumably why R warns that the approximation may be incorrect.
chi1 <- xtabs(formula = ~ salary + sex, data = Salary_known)
chisq.test(x = chi1)
## Warning in chisq.test(chi1): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  chi1
## X-squared = 55.494, df = 42, p-value = 0.07929
# Pearson chi-square test of independence between salary and quarter.
# The contingency table has one row per distinct salary value.
# NOTE(review): with so many salary levels the table is sparse, which is
# presumably why R warns that the approximation may be incorrect.
chi2 <- xtabs(formula = ~ salary + quarter, data = Salary_known)
chisq.test(x = chi2)
## Warning in chisq.test(chi2): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  chi2
## X-squared = 132.64, df = 126, p-value = 0.3252
# Pearson chi-square test of independence between salary and first language.
# The contingency table has one row per distinct salary value.
# NOTE(review): with so many salary levels the table is sparse, which is
# presumably why R warns that the approximation may be incorrect.
chi3 <- xtabs(formula = ~ salary + frstlang, data = Salary_known)
chisq.test(x = chi3)
## Warning in chisq.test(chi3): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  chi3
## X-squared = 62.016, df = 42, p-value = 0.02384

##Regression Analysis

# Ordinary least-squares fit of salary on ten predictors, using the rows in
# Salary_known, followed by the coefficient table and fit statistics.
regress <- lm(
  salary ~ age + sex + gmat_qpc + gmat_vpc + gmat_tpc +
    s_avg + f_avg + work_yrs + frstlang + satis,
  data = Salary_known
)
summary(regress)
## 
## Call:
## lm(formula = salary ~ age + sex + gmat_qpc + gmat_vpc + gmat_tpc + 
##     s_avg + f_avg + work_yrs + frstlang + satis, data = Salary_known)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -93924 -48912  20019  44376 179796 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  40696.2    70338.2   0.579   0.5636  
## age          -4376.4     1916.4  -2.284   0.0235 *
## sex           1749.0     8733.1   0.200   0.8415  
## gmat_qpc      -246.8      442.3  -0.558   0.5775  
## gmat_vpc      -381.8      406.9  -0.938   0.3494  
## gmat_tpc       571.8      652.3   0.876   0.3819  
## s_avg        22054.8    12502.0   1.764   0.0794 .
## f_avg        -5868.9     8794.0  -0.667   0.5054  
## work_yrs      3252.9     2171.5   1.498   0.1359  
## frstlang     13151.7    15529.4   0.847   0.3982  
## satis         9967.1     5137.4   1.940   0.0539 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 52430 on 182 degrees of freedom
## Multiple R-squared:  0.07776,    Adjusted R-squared:  0.02709 
## F-statistic: 1.535 on 10 and 182 DF,  p-value: 0.1301
# Reduced model: the same fit without gmat_vpc and s_avg. This replaces the
# previous `regress` object.
regress <- lm(
  salary ~ age + sex + gmat_qpc + gmat_tpc +
    f_avg + work_yrs + frstlang + satis,
  data = Salary_known
)
summary(regress)
## 
## Call:
## lm(formula = salary ~ age + sex + gmat_qpc + gmat_tpc + f_avg + 
##     work_yrs + frstlang + satis, data = Salary_known)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -80262 -52304  20427  44021 169869 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  68519.7    67901.0   1.009   0.3142  
## age          -4385.4     1915.6  -2.289   0.0232 *
## sex           1776.0     8764.8   0.203   0.8397  
## gmat_qpc      -153.5      382.5  -0.401   0.6886  
## gmat_tpc       294.5      417.2   0.706   0.4811  
## f_avg         2627.6     7477.6   0.351   0.7257  
## work_yrs      3671.7     2171.7   1.691   0.0926 .
## frstlang     14674.3    15160.5   0.968   0.3343  
## satis         9335.3     5152.1   1.812   0.0716 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 52720 on 184 degrees of freedom
## Multiple R-squared:  0.05736,    Adjusted R-squared:  0.01637 
## F-statistic:   1.4 on 8 and 184 DF,  p-value: 0.1992

##Comparison

# Add a 0/1 indicator to Salary_known: 0 when salary is exactly 0, 1 for any
# other salary. Then plot how many rows fall in each group.
Salary_known$placed <- as.numeric(Salary_known$salary != 0)
counts_placed <- table(Salary_known$placed)
barplot(
  counts_placed,
  names.arg = c("Not Placed", "Placed"),
  col = c("red", "green"),
  main = "Placed vs Not-placed",
  xlab = "Status",
  width = 1,
  space = 1,
  xlim = c(0, 10),
  ylim = c(0, 105)
)