# Attach psych first: describe() below comes from this package.
library(psych)

# Read the survey data from the current working directory. The file name is
# passed directly; wrapping a single string literal in paste(..., sep = "")
# did nothing.
start_salary <- read.csv("MBA Starting Salaries Data.csv")

# Print per-column summary statistics (n, mean, sd, median, min, max, ...).
describe(start_salary)
## vars n mean sd median trimmed mad min max
## age 1 274 27.36 3.71 27 26.76 2.97 22 48
## sex 2 274 1.25 0.43 1 1.19 0.00 1 2
## gmat_tot 3 274 619.45 57.54 620 618.86 59.30 450 790
## gmat_qpc 4 274 80.64 14.87 83 82.31 14.83 28 99
## gmat_vpc 5 274 78.32 16.86 81 80.33 14.83 16 99
## gmat_tpc 6 274 84.20 14.02 87 86.12 11.86 0 99
## s_avg 7 274 3.03 0.38 3 3.03 0.44 2 4
## f_avg 8 274 3.06 0.53 3 3.09 0.37 0 4
## quarter 9 274 2.48 1.11 2 2.47 1.48 1 4
## work_yrs 10 274 3.87 3.23 3 3.29 1.48 0 22
## frstlang 11 274 1.12 0.32 1 1.02 0.00 1 2
## salary 12 274 39025.69 50951.56 999 33607.86 1481.12 0 220000
## satis 13 274 172.18 371.61 6 91.50 1.48 1 998
## range skew kurtosis se
## age 26 2.16 6.45 0.22
## sex 1 1.16 -0.66 0.03
## gmat_tot 340 -0.01 0.06 3.48
## gmat_qpc 71 -0.92 0.30 0.90
## gmat_vpc 83 -1.04 0.74 1.02
## gmat_tpc 99 -2.28 9.02 0.85
## s_avg 2 -0.06 -0.38 0.02
## f_avg 4 -2.08 10.85 0.03
## quarter 3 0.02 -1.35 0.07
## work_yrs 22 2.78 9.80 0.20
## frstlang 1 2.37 3.65 0.02
## salary 220000 0.70 -1.05 3078.10
## satis 997 1.77 1.13 22.45
# Partition the survey rows by the value stored in `salary`. The data frame
# names suggest that 0, 998 and 999 are status codes rather than real
# salaries -- TODO confirm against the data set's codebook.
# which() converts each logical mask to row indices, so rows where the
# comparison is NA are dropped instead of becoming all-NA rows.
Placed <- start_salary[which(start_salary$salary > 999), ]
Not_Placed <- start_salary[which(start_salary$salary == 0), ]
Salary_not_disclosed <- start_salary[which(start_salary$salary == 999), ]
Not_answered <- start_salary[which(start_salary$salary == 998), ]

# Rows whose salary is either 0 or strictly above the 998/999 codes.
Salary_known <- start_salary[
  which((start_salary$salary == 0) | (start_salary$salary > 999)),
]

# Rows whose satisfaction score is below 8; `satis` also holds much larger
# values (its maximum in the describe() output is 998).
Satis_known <- start_salary[which(start_salary$satis < 8), ]

# View() opens a data viewer, which is only meaningful when someone is
# sitting at the session. Skip it when the script runs non-interactively
# (Rscript, knitr) so the analysis can proceed unattended.
if (interactive()) {
  View(Placed)
  View(Not_Placed)
  View(Salary_not_disclosed)
  View(Not_answered)
  View(Salary_known)
  View(Satis_known)
}
# Horizontal box plot of the `age` column for all survey rows.
boxplot(
  x = start_salary$age,
  main = "Box Plot for Age",
  xlab = "Age",
  col = "tomato",
  horizontal = TRUE
)
# Count the rows per `sex` code and draw the counts as a bar chart.
counts_sex <- table(start_salary$sex)
barplot(
  counts_sex,
  width = 1,
  space = 1,
  main = "Gender Distribution",
  xlab = "Gender",
  col = c("darkblue", "pink"),
  # Labels assume the first table level is male and the second female
  # -- TODO confirm the coding of `sex`.
  names.arg = c("Male", "Female"),
  # Derive the y range from the counts (with 10% headroom). The previous fixed
  # upper limit of 105 was smaller than the tallest bar, so that bar ran past
  # the end of the axis.
  ylim = c(0, 1.1 * max(counts_sex)),
  xlim = c(0, 10)
)
# Draw one horizontal box plot per GMAT column. Each entry gives the column
# to plot together with its title, x-axis label and fill colour; the plots are
# produced in the order listed.
invisible(lapply(
  list(
    list(column = "gmat_tot",
         title = "Box Plot for GMAT Total",
         label = "Score",
         fill = "blue"),
    list(column = "gmat_qpc",
         title = "Box Plot for Quantitative GMAT Percentile",
         label = "Percentile",
         fill = "red"),
    list(column = "gmat_vpc",
         title = "Box Plot for Verbal GMAT Percentile ",
         label = "Percentile",
         fill = "green"),
    list(column = "gmat_tpc",
         title = "Box Plot for Overall GMAT Percentile ",
         label = "Percentile",
         fill = "yellow")
  ),
  function(spec) {
    boxplot(
      start_salary[[spec$column]],
      horizontal = TRUE,
      main = spec$title,
      xlab = spec$label,
      col = spec$fill
    )
  }
))
# Draw horizontal box plots for the two MBA average columns and for years of
# work experience. Each entry gives the column to plot together with its
# title, x-axis label and fill colour; the plots are produced in list order.
invisible(lapply(
  list(
    list(column = "s_avg",
         title = "Box Plot for Spring MBA Average ",
         label = "Average",
         fill = "navy"),
    list(column = "f_avg",
         title = "Box Plot for Fall MBA Average ",
         label = "Average",
         fill = "green"),
    list(column = "work_yrs",
         title = "Box Plot for Years of Work Experience ",
         label = "Years",
         fill = "pink")
  ),
  function(spec) {
    boxplot(
      start_salary[[spec$column]],
      horizontal = TRUE,
      main = spec$title,
      xlab = spec$label,
      col = spec$fill
    )
  }
))
# Count the rows per `frstlang` code and draw the counts as a bar chart.
counts_lang <- table(start_salary$frstlang)
barplot(
  counts_lang,
  width = 1,
  space = 1,
  main = "First Language of Students ",
  xlab = "Language",
  col = c("purple", "darkgreen"),
  # Labels assume the first table level is English and the second is any
  # other language -- TODO confirm the coding of `frstlang`.
  names.arg = c("English", "Other"),
  # Derive the y range from the counts (with 10% headroom). The previous fixed
  # upper limit of 105 was smaller than the tallest bar, so that bar ran past
  # the end of the axis.
  ylim = c(0, 1.1 * max(counts_lang)),
  xlim = c(0, 10)
)
# Horizontal box plot of `salary` for the rows kept in Salary_known
# (salary equal to 0 or above 999).
boxplot(
  x = Salary_known$salary,
  main = "Box Plot for Starting Salary ",
  xlab = "Salary ",
  col = "blue",
  horizontal = TRUE
)
##Histogram
# Histogram of the satisfaction scores (integer scale, 1 = low to 7 = high).
# Breaks sit on the half-integers so each score gets its own bar. With
# `breaks = 5` the breaks landed on the integers themselves, and because the
# first bin is closed on both sides the two lowest scores were merged into a
# single bar.
hist(
  Satis_known$satis,
  breaks = seq(0.5, 7.5, by = 1),
  col = "orange",
  xlab = "Satisfaction level,1=low 7=high",
  main = "Satisfaction distribution"
)
##Scatter plot
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
# Scatterplot matrix of salary against age, GMAT total, the two MBA averages
# and work experience, with points grouped by `sex`.
# Plot Salary_known rather than the raw data: start_salary still contains the
# 998/999 status codes in `salary`, which would otherwise be drawn as if they
# were real salaries. This also matches the data used by the other salary
# analyses in this script.
scatterplotMatrix(
  ~ salary + age + gmat_tot + s_avg + f_avg + work_yrs | sex,
  data = Salary_known,
  main = "Variation of Salary with other variables"
)
##Corrgram
library(corrgram)
# Correlogram of the columns in Salary_known: shaded cells below the diagonal,
# pie glyphs above it, variable names on the diagonal. `order = TRUE` lets
# corrgram reorder the variables.
corrgram(
  Salary_known,
  order = TRUE,
  main = "MBA starting salary analysis Corrgram",
  text.panel = panel.txt,
  upper.panel = panel.pie,
  lower.panel = panel.shade
)
##Corrplot
library(corrplot)
## corrplot 0.84 loaded
# Keep only the columns to correlate, in the order they should appear.
corr <- Salary_known[
  c(
    "age", "work_yrs",
    "gmat_tot", "gmat_qpc", "gmat_vpc", "gmat_tpc",
    "s_avg", "f_avg", "quarter", "satis"
  )
]

# Pairwise correlation matrix of those columns (cor() defaults), drawn with
# circle glyphs.
corr_final <- cor(x = corr)
corrplot(corr_final, method = "circle")
##Pearson Chi-square Test
# Cross-tabulate salary against sex for the Salary_known rows, then run
# Pearson's chi-squared test on that contingency table.
# NOTE(review): the recorded output for this test carries a warning that the
# chi-squared approximation may be incorrect.
chi1 <- xtabs(~ salary + sex, data = Salary_known)
chisq.test(x = chi1)
## Warning in chisq.test(chi1): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: chi1
## X-squared = 55.494, df = 42, p-value = 0.07929
# Cross-tabulate salary against quarter for the Salary_known rows, then run
# Pearson's chi-squared test on that contingency table.
# NOTE(review): the recorded output for this test carries a warning that the
# chi-squared approximation may be incorrect.
chi2 <- xtabs(~ salary + quarter, data = Salary_known)
chisq.test(x = chi2)
## Warning in chisq.test(chi2): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: chi2
## X-squared = 132.64, df = 126, p-value = 0.3252
# Cross-tabulate salary against first language for the Salary_known rows, then
# run Pearson's chi-squared test on that contingency table.
# NOTE(review): the recorded output for this test carries a warning that the
# chi-squared approximation may be incorrect.
chi3 <- xtabs(~ salary + frstlang, data = Salary_known)
chisq.test(x = chi3)
## Warning in chisq.test(chi3): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: chi3
## X-squared = 62.016, df = 42, p-value = 0.02384
# Linear model of salary on ten predictors, fitted to the Salary_known rows.
# NOTE(review): Salary_known keeps rows with salary == 0 alongside real
# salaries, so those zeros are part of the response -- confirm this is
# intended.
regress <- lm(
  formula = salary ~ age + sex + gmat_qpc + gmat_vpc + gmat_tpc +
    s_avg + f_avg + work_yrs + frstlang + satis,
  data = Salary_known
)

# Print the coefficient table and fit statistics.
summary(regress)
##
## Call:
## lm(formula = salary ~ age + sex + gmat_qpc + gmat_vpc + gmat_tpc +
## s_avg + f_avg + work_yrs + frstlang + satis, data = Salary_known)
##
## Residuals:
## Min 1Q Median 3Q Max
## -93924 -48912 20019 44376 179796
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 40696.2 70338.2 0.579 0.5636
## age -4376.4 1916.4 -2.284 0.0235 *
## sex 1749.0 8733.1 0.200 0.8415
## gmat_qpc -246.8 442.3 -0.558 0.5775
## gmat_vpc -381.8 406.9 -0.938 0.3494
## gmat_tpc 571.8 652.3 0.876 0.3819
## s_avg 22054.8 12502.0 1.764 0.0794 .
## f_avg -5868.9 8794.0 -0.667 0.5054
## work_yrs 3252.9 2171.5 1.498 0.1359
## frstlang 13151.7 15529.4 0.847 0.3982
## satis 9967.1 5137.4 1.940 0.0539 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 52430 on 182 degrees of freedom
## Multiple R-squared: 0.07776, Adjusted R-squared: 0.02709
## F-statistic: 1.535 on 10 and 182 DF, p-value: 0.1301
# Refit with a reduced predictor set (gmat_vpc and s_avg removed). This
# overwrites the `regress` object from the previous fit.
regress <- lm(
  formula = salary ~ age + sex + gmat_qpc + gmat_tpc + f_avg +
    work_yrs + frstlang + satis,
  data = Salary_known
)

# Print the coefficient table and fit statistics.
summary(regress)
##
## Call:
## lm(formula = salary ~ age + sex + gmat_qpc + gmat_tpc + f_avg +
## work_yrs + frstlang + satis, data = Salary_known)
##
## Residuals:
## Min 1Q Median 3Q Max
## -80262 -52304 20427 44021 169869
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 68519.7 67901.0 1.009 0.3142
## age -4385.4 1915.6 -2.289 0.0232 *
## sex 1776.0 8764.8 0.203 0.8397
## gmat_qpc -153.5 382.5 -0.401 0.6886
## gmat_tpc 294.5 417.2 0.706 0.4811
## f_avg 2627.6 7477.6 0.351 0.7257
## work_yrs 3671.7 2171.7 1.691 0.0926 .
## frstlang 14674.3 15160.5 0.968 0.3343
## satis 9335.3 5152.1 1.812 0.0716 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 52720 on 184 degrees of freedom
## Multiple R-squared: 0.05736, Adjusted R-squared: 0.01637
## F-statistic: 1.4 on 8 and 184 DF, p-value: 0.1992
# Flag each Salary_known row: 0 when salary is exactly 0, 1 otherwise.
Salary_known$placed <- as.numeric(Salary_known$salary != 0)

# Tally the two flag values and show them as a bar chart.
counts_placed <- table(Salary_known$placed)
barplot(
  counts_placed,
  main = "Placed vs Not-placed",
  xlab = "Status",
  names.arg = c("Not Placed", "Placed"),
  col = c("red", "green"),
  width = 1,
  space = 1,
  xlim = c(0, 10),
  ylim = c(0, 105)
)