# Load the MBA starting-salary survey data and inspect its structure.
# NOTE(review): the hard-coded setwd() ties the script to one machine; it is
# kept here so any later relative paths keep resolving the same way.
setwd("C:/Users/Dell/Desktop/Project/Week 4")
mba_salary <- read.csv("MBA Starting Salaries Data.csv")

# View() opens a GUI data viewer, so only call it when a user is present;
# in batch runs (Rscript, knitting) it may fail or open a stray window.
if (interactive()) {
  View(mba_salary)
}

# Print column types and the first few values of every variable.
str(mba_salary)
## 'data.frame': 274 obs. of 13 variables:
## $ age : int 23 24 24 24 24 24 25 25 25 25 ...
## $ sex : int 2 1 1 1 2 1 1 2 1 1 ...
## $ gmat_tot: int 620 610 670 570 710 640 610 650 630 680 ...
## $ gmat_qpc: int 77 90 99 56 93 82 89 88 79 99 ...
## $ gmat_vpc: int 87 71 78 81 98 89 74 89 91 81 ...
## $ gmat_tpc: int 87 87 95 75 98 91 87 92 89 96 ...
## $ s_avg : num 3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
## $ f_avg : num 3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
## $ quarter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ work_yrs: int 2 2 2 1 2 2 2 2 2 2 ...
## $ frstlang: int 1 1 1 1 1 1 1 1 2 1 ...
## $ salary : int 0 0 0 0 999 0 0 0 999 998 ...
## $ satis : int 7 6 6 7 5 6 5 6 4 998 ...
We see that a few categorical variables are stored as integers instead of factors. Hence, we need to convert them.
# Recode the integer-coded categorical columns as factors in one step,
# then print the structure again to confirm the new column types.
mba_salary[c("sex", "frstlang")] <- lapply(
  mba_salary[c("sex", "frstlang")],
  factor
)
str(mba_salary)
## 'data.frame': 274 obs. of 13 variables:
## $ age : int 23 24 24 24 24 24 25 25 25 25 ...
## $ sex : Factor w/ 2 levels "1","2": 2 1 1 1 2 1 1 2 1 1 ...
## $ gmat_tot: int 620 610 670 570 710 640 610 650 630 680 ...
## $ gmat_qpc: int 77 90 99 56 93 82 89 88 79 99 ...
## $ gmat_vpc: int 87 71 78 81 98 89 74 89 91 81 ...
## $ gmat_tpc: int 87 87 95 75 98 91 87 92 89 96 ...
## $ s_avg : num 3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
## $ f_avg : num 3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
## $ quarter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ work_yrs: int 2 2 2 1 2 2 2 2 2 2 ...
## $ frstlang: Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 2 1 ...
## $ salary : int 0 0 0 0 999 0 0 0 999 998 ...
## $ satis : int 7 6 6 7 5 6 5 6 4 998 ...
# Per-column summary: quantiles and mean for numeric columns, level counts
# for the factor columns.
summary(object = mba_salary)
## age sex gmat_tot gmat_qpc gmat_vpc
## Min. :22.00 1:206 Min. :450.0 Min. :28.00 Min. :16.00
## 1st Qu.:25.00 2: 68 1st Qu.:580.0 1st Qu.:72.00 1st Qu.:71.00
## Median :27.00 Median :620.0 Median :83.00 Median :81.00
## Mean :27.36 Mean :619.5 Mean :80.64 Mean :78.32
## 3rd Qu.:29.00 3rd Qu.:660.0 3rd Qu.:93.00 3rd Qu.:91.00
## Max. :48.00 Max. :790.0 Max. :99.00 Max. :99.00
## gmat_tpc s_avg f_avg quarter
## Min. : 0.0 Min. :2.000 Min. :0.000 Min. :1.000
## 1st Qu.:78.0 1st Qu.:2.708 1st Qu.:2.750 1st Qu.:1.250
## Median :87.0 Median :3.000 Median :3.000 Median :2.000
## Mean :84.2 Mean :3.025 Mean :3.062 Mean :2.478
## 3rd Qu.:94.0 3rd Qu.:3.300 3rd Qu.:3.250 3rd Qu.:3.000
## Max. :99.0 Max. :4.000 Max. :4.000 Max. :4.000
## work_yrs frstlang salary satis
## Min. : 0.000 1:242 Min. : 0 Min. : 1.0
## 1st Qu.: 2.000 2: 32 1st Qu.: 0 1st Qu.: 5.0
## Median : 3.000 Median : 999 Median : 6.0
## Mean : 3.872 Mean : 39026 Mean :172.2
## 3rd Qu.: 4.000 3rd Qu.: 97000 3rd Qu.: 7.0
## Max. :22.000 Max. :220000 Max. :998.0
library(psych)
# Extended descriptive statistics (sd, trimmed mean, mad, skew, kurtosis, se)
# from the psych package; called with an explicit namespace.
psych::describe(mba_salary)
## vars n mean sd median trimmed mad min max
## age 1 274 27.36 3.71 27 26.76 2.97 22 48
## sex* 2 274 1.25 0.43 1 1.19 0.00 1 2
## gmat_tot 3 274 619.45 57.54 620 618.86 59.30 450 790
## gmat_qpc 4 274 80.64 14.87 83 82.31 14.83 28 99
## gmat_vpc 5 274 78.32 16.86 81 80.33 14.83 16 99
## gmat_tpc 6 274 84.20 14.02 87 86.12 11.86 0 99
## s_avg 7 274 3.03 0.38 3 3.03 0.44 2 4
## f_avg 8 274 3.06 0.53 3 3.09 0.37 0 4
## quarter 9 274 2.48 1.11 2 2.47 1.48 1 4
## work_yrs 10 274 3.87 3.23 3 3.29 1.48 0 22
## frstlang* 11 274 1.12 0.32 1 1.02 0.00 1 2
## salary 12 274 39025.69 50951.56 999 33607.86 1481.12 0 220000
## satis 13 274 172.18 371.61 6 91.50 1.48 1 998
## range skew kurtosis se
## age 26 2.16 6.45 0.22
## sex* 1 1.16 -0.66 0.03
## gmat_tot 340 -0.01 0.06 3.48
## gmat_qpc 71 -0.92 0.30 0.90
## gmat_vpc 83 -1.04 0.74 1.02
## gmat_tpc 99 -2.28 9.02 0.85
## s_avg 2 -0.06 -0.38 0.02
## f_avg 4 -2.08 10.85 0.03
## quarter 3 0.02 -1.35 0.07
## work_yrs 22 2.78 9.80 0.20
## frstlang* 1 2.37 3.65 0.02
## salary 220000 0.70 -1.05 3078.10
## satis 997 1.77 1.13 22.45
6A. Visualizing the data given about the students
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
library(gtable)
library(grid)
# Panel 6A: distributions of the student-background variables.
# Histograms for the numeric columns, bar counts for the factor columns.
f1 <- ggplot(mba_salary, aes(age)) + stat_bin(bins = 25, color = "gray")
f2 <- ggplot(mba_salary, aes(sex)) + stat_count(color = "gray")
f3 <- ggplot(mba_salary, aes(frstlang)) + stat_count(color = "gray")
f4 <- ggplot(mba_salary, aes(work_yrs)) + stat_bin(bins = 25, color = "gray")

# Render every plot to a gtable so it can be placed as a grob.
ff1 <- ggplot_gtable(ggplot_build(f1))
ff2 <- ggplot_gtable(ggplot_build(f2))
ff3 <- ggplot_gtable(ggplot_build(f3))
ff4 <- ggplot_gtable(ggplot_build(f4))

# 2 x 2 layout with relative column widths 1:2 and row heights 2:1.
ff <- gtable(
  widths = unit(c(1, 2), "null"),
  heights = unit(c(2, 1), "null")
)

# Fill the grid row by row: (1,1), (1,2), (2,1), (2,2).
ff <- gtable_add_grob(
  ff,
  list(ff1, ff2, ff3, ff4),
  t = c(1, 1, 2, 2),
  l = c(1, 2, 1, 2)
)
grid.draw(ff)
6B. Visualizing the data given about the performance in GMAT examination
# Panel 6B: histograms of the four GMAT measures (total score and the
# quantitative, verbal and overall percentiles), 20 bins each.
a1 <- ggplot(mba_salary, aes(gmat_tot)) + stat_bin(bins = 20, color = "gray")
a2 <- ggplot(mba_salary, aes(gmat_qpc)) + stat_bin(bins = 20, color = "gray")
a3 <- ggplot(mba_salary, aes(gmat_vpc)) + stat_bin(bins = 20, color = "gray")
a4 <- ggplot(mba_salary, aes(gmat_tpc)) + stat_bin(bins = 20, color = "gray")

# Render every plot to a gtable so it can be placed as a grob.
aa1 <- ggplot_gtable(ggplot_build(a1))
aa2 <- ggplot_gtable(ggplot_build(a2))
aa3 <- ggplot_gtable(ggplot_build(a3))
aa4 <- ggplot_gtable(ggplot_build(a4))

# 2 x 2 layout with relative column widths 1:2 and row heights 2:1.
aa <- gtable(
  widths = unit(c(1, 2), "null"),
  heights = unit(c(2, 1), "null")
)

# Fill the grid row by row: (1,1), (1,2), (2,1), (2,2).
aa <- gtable_add_grob(
  aa,
  list(aa1, aa2, aa3, aa4),
  t = c(1, 1, 2, 2),
  l = c(1, 2, 1, 2)
)
grid.draw(aa)
6C. Visualizing the data given about the performance in the MBA program
# Panel 6C: performance in the MBA program. Histograms for the spring and
# fall averages, bar counts for quartile ranking and satisfaction.
# NOTE(review): satis still contains the value 998 at this point (see the
# str()/summary() output), so its bar chart includes that code — confirm
# whether it should be filtered before plotting.
b1 <- ggplot(mba_salary, aes(s_avg)) + stat_bin(bins = 20, color = "gray")
b2 <- ggplot(mba_salary, aes(f_avg)) + stat_bin(bins = 20, color = "gray")
b3 <- ggplot(mba_salary, aes(quarter)) + stat_count(color = "gray")
b4 <- ggplot(mba_salary, aes(satis)) + stat_count(color = "gray")

# Render every plot to a gtable so it can be placed as a grob.
bb1 <- ggplot_gtable(ggplot_build(b1))
bb2 <- ggplot_gtable(ggplot_build(b2))
bb3 <- ggplot_gtable(ggplot_build(b3))
bb4 <- ggplot_gtable(ggplot_build(b4))

# 2 x 2 layout with relative column widths 1:2 and row heights 2:1.
bb <- gtable(
  widths = unit(c(1, 2), "null"),
  heights = unit(c(2, 1), "null")
)

# Fill the grid row by row: (1,1), (1,2), (2,1), (2,2).
bb <- gtable_add_grob(
  bb,
  list(bb1, bb2, bb3, bb4),
  t = c(1, 1, 2, 2),
  l = c(1, 2, 1, 2)
)
grid.draw(bb)
We have been told that there are some people who did not answer the survey and some who did not disclose their salary. Hence, we need to treat the ‘missing value’ of unrevealed salaries by giving them the value of the average salary.
# Rows with an actual salary figure (values of 1000 and above).
Salaried <- subset(mba_salary, salary >= 1000)

# Rows coded 999 are treated as placed students whose salary is undisclosed.
Unrevealed <- subset(mba_salary, salary == 999)

# Impute every undisclosed salary with the mean of the disclosed ones.
Unrevealed$salary <- mean(Salaried$salary)

# Combined data set of placed students: imputed rows first, then disclosed.
Placed <- rbind(Unrevealed, Salaried)
library(lattice)
# Lattice histogram of salary across the placed students.
histogram(
  ~salary,
  data = Placed,
  main = "Histogram of Salary",
  xlab = "Salary",
  col = "gray"
)
Salary of the placed students can be pictorially represented in relation with various variables as shown below:
9A. Scatterplot of Salary versus Age
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
# Salary against age for placed students, with solid points (pch = 19).
# NOTE(review): `spread` and `smoother.args` look like car 2.x arguments;
# newer car releases take `smooth = list(...)` instead — confirm against the
# installed version.
scatterplot(
  salary ~ age,
  data = Placed,
  spread = FALSE,
  smoother.args = list(lty = 2),
  pch = 19,
  main = "Scatterplot of Salary vs Age",
  xlab = "Age",
  ylab = "Salary"
)
9B. Scatterplot of Salary vs Total GMAT Score
# Salary against total GMAT score for placed students.
scatterplot(
  salary ~ gmat_tot,
  data = Placed,
  spread = FALSE,
  smoother.args = list(lty = 2),
  pch = 19,
  main = "Scatterplot of Salary vs Total GMAT Score",
  xlab = "Total GMAT Score",
  ylab = "Salary"
)
9C. Scatterplot of Salary vs Spring MBA Score
# Salary against the spring MBA average for placed students.
scatterplot(
  salary ~ s_avg,
  data = Placed,
  spread = FALSE,
  smoother.args = list(lty = 2),
  pch = 19,
  main = "Scatterplot of Salary vs Spring MBA Average",
  xlab = "Spring MBA Average",
  ylab = "Salary"
)
9D. Scatterplot of Salary vs Fall MBA Score
# Salary against the fall MBA average for placed students.
scatterplot(
  salary ~ f_avg,
  data = Placed,
  spread = FALSE,
  smoother.args = list(lty = 2),
  pch = 19,
  main = "Scatterplot of Salary vs Fall MBA Average",
  xlab = "Fall MBA Average",
  ylab = "Salary"
)
9E. Scatterplot of Salary vs Quartile Ranking
# Salary against quartile ranking for placed students.
scatterplot(
  salary ~ quarter,
  data = Placed,
  spread = FALSE,
  smoother.args = list(lty = 2),
  pch = 19,
  main = "Scatterplot of Salary vs Quartile Ranking",
  xlab = "Quartile Ranking",
  ylab = "Salary"
)
9F. Scatterplot of Salary vs Years of Work Experience
# Salary against years of work experience for placed students.
scatterplot(
  salary ~ work_yrs,
  data = Placed,
  spread = FALSE,
  smoother.args = list(lty = 2),
  pch = 19,
  main = "Scatterplot of Salary vs Years of Work Experience",
  xlab = "Years of Work Experience",
  ylab = "Salary"
)
A few points to be noted are:
a.) The Fall MBA Average as a variable is slightly negatively correlated with Salary of the student.
b.) Spring MBA Average is slightly positively correlated with Salary.
c.) On the other hand, Age and Years of Work Experience are strongly positively correlated with Salary.
d.) At the end, the salary is hardly correlated with the Total GMAT Score and Quartile Ranking.
The correlations can be further explored with the help of corrgram.
library(corrgram)
# Correlogram of the Placed data in original column order: shaded cells
# below the diagonal, pie glyphs above, min/max values on the diagonal.
corrgram(
  Placed,
  order = FALSE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  diag.panel = panel.minmax,
  text.panel = panel.txt,
  main = "Corrgram of Salaries intercorrelations"
)
On studying the corrgram closely, we can infer the following:
a.) Age and Years of Work Experience are highly correlated. ‘Work Experience’ is the more relevant metric as far as the case is concerned.
b.) Total GMAT Score, Quantitative GMAT Percentile, Verbal GMAT Percentile and Overall GMAT Percentile are highly correlated as well. Hence, we choose to include ‘Total GMAT Score’ and reject the rest.
c.) Spring MBA Average, Fall MBA Average and Quartile Ranking are highly correlated too. Among them, we choose ‘Quartile Ranking’ to include in our regression further.
d.) There seem to exist some correlations between Salary with variables like Age, Total GMAT Score, Quartile Ranking. It can be seen clearly in the following scatter plots:
This does not give the complete picture, since categorical variables like Sex and First Language Spoken have not been considered. Here are some figures which will help in understanding it better:
# Mean salary of placed students for each level of sex.
aggregate(
  x = Placed$salary,
  by = list(Sex = Placed$sex),
  FUN = mean
)
## Sex x
## 1 1 104400.32
## 2 2 99150.27
# Mean salary of placed students for each level of first language.
aggregate(
  x = Placed$salary,
  by = list(First_Language_Spoken = Placed$frstlang),
  FUN = mean
)
## First_Language_Spoken x
## 1 1 102021.8
## 2 2 110723.5
The average salary of a male student is more than that of a female and there seems to be no great difference in the salaries with respect to the native language spoken by the student.
If not on the salaries solely, these categorical variables can have an impact when in combination with other variables:
12A. Scatterplot of Salary vs Age|sex
# Salary against age, with points and fits drawn separately per sex level.
scatterplot(
  salary ~ age | sex,
  data = Placed,
  spread = FALSE,
  smoother.args = list(lty = 2),
  pch = 19,
  main = "Scatterplot of Salary vs Age",
  xlab = "Age",
  ylab = "Salary"
)
12B. Scatterplot of Salary vs Total GMAT Score|First language
# Salary against total GMAT score, grouped by first-language level.
scatterplot(
  salary ~ gmat_tot | frstlang,
  data = Placed,
  spread = FALSE,
  smoother.args = list(lty = 2),
  pch = 19,
  main = "Scatterplot of Salary vs Total GMAT Score",
  xlab = "Total GMAT Score",
  ylab = "Salary"
)
After visualizing the data, we need to formulate a hypothesis, which will be tested with the help of a regression model:
In order to be able to answer this question, we need to develop a null hypothesis which states that
“There exist no factors that influence the starting salaries received by the students post MBA”
library(MASS)
# Test whether starting salary is associated with years of work experience.
# A two-sample t.test() on these two columns would compare the mean of salary
# with the mean of work_yrs (different variables on different scales), which
# says nothing about whether experience relates to salary. cor.test() instead
# tests H0: the correlation between the two variables is zero.
# NOTE: the "Welch Two Sample t-test" output recorded below was produced by
# the previous t.test() call and must be regenerated.
cor.test(Placed$salary, Placed$work_yrs)
##
## Welch Two Sample t-test
##
## data: Placed$salary and Placed$work_yrs
## t = 78.497, df = 137, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 100431.7 105622.4
## sample estimates:
## mean of x mean of y
## 1.030307e+05 3.666667e+00
Since the value of p is less than 0.05, there is a significant difference in the salaries of people with different years of work experience. Thus, the null hypothesis is rejected.
In order to validate this further, we need to develop a regression model as follows:
# M1: starting salary regressed on all six retained candidate predictors.
M1 <- lm(
  salary ~ sex + work_yrs + frstlang + satis + quarter + gmat_tot,
  data = Placed
)
summary(M1)
##
## Call:
## lm(formula = salary ~ sex + work_yrs + frstlang + satis + quarter +
## gmat_tot, data = Placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28423 -6666 -1437 3943 91498
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 97853.222 16352.878 5.984 1.96e-08 ***
## sex2 -4630.516 2719.460 -1.703 0.091 .
## work_yrs 2187.366 445.426 4.911 2.65e-06 ***
## frstlang2 7685.888 3970.182 1.936 0.055 .
## satis -124.958 1105.669 -0.113 0.910
## quarter -1467.974 1059.401 -1.386 0.168
## gmat_tot 2.808 22.885 0.123 0.903
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13860 on 131 degrees of freedom
## Multiple R-squared: 0.2278, Adjusted R-squared: 0.1924
## F-statistic: 6.44 on 6 and 131 DF, p-value: 5.719e-06
After removing the statistically insignificant variables,
# M2: reduced model keeping only sex, work experience and first language.
M2 <- lm(
  salary ~ sex + work_yrs + frstlang,
  data = Placed
)
summary(M2)
##
## Call:
## lm(formula = salary ~ sex + work_yrs + frstlang, data = Placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31346 -7497 -1092 4654 88706
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 95113.7 2140.2 44.442 < 2e-16 ***
## sex2 -4272.1 2689.3 -1.589 0.1145
## work_yrs 2252.3 433.1 5.201 7.26e-07 ***
## frstlang2 6668.9 3701.3 1.802 0.0738 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13800 on 134 degrees of freedom
## Multiple R-squared: 0.2161, Adjusted R-squared: 0.1985
## F-statistic: 12.31 on 3 and 134 DF, p-value: 3.661e-07
Relying on just three variables to explain the salaries is risky for the regression. Hence, we choose one more variable from among the statistically insignificant ones.
# M3: the three-variable model plus quartile ranking.
M3 <- lm(
  salary ~ sex + work_yrs + frstlang + quarter,
  data = Placed
)
summary(M3)
##
## Call:
## lm(formula = salary ~ sex + work_yrs + frstlang + quarter, data = Placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28751 -6515 -1500 3769 91268
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 98984 3474 28.491 < 2e-16 ***
## sex2 -4660 2694 -1.730 0.0859 .
## work_yrs 2174 435 4.998 1.79e-06 ***
## frstlang2 7717 3762 2.051 0.0422 *
## quarter -1480 1049 -1.411 0.1606
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13750 on 133 degrees of freedom
## Multiple R-squared: 0.2276, Adjusted R-squared: 0.2044
## F-statistic: 9.798 on 4 and 133 DF, p-value: 5.608e-07
# M4: the three-variable model plus total GMAT score.
M4 <- lm(
  salary ~ sex + work_yrs + frstlang + gmat_tot,
  data = Placed
)
summary(M4)
##
## Call:
## lm(formula = salary ~ sex + work_yrs + frstlang + gmat_tot, data = Placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31041 -7532 -1297 4533 88980
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 92114.887 14637.024 6.293 4.18e-09 ***
## sex2 -4252.827 2700.518 -1.575 0.1177
## work_yrs 2265.501 439.319 5.157 8.90e-07 ***
## frstlang2 6811.001 3777.404 1.803 0.0736 .
## gmat_tot 4.729 22.834 0.207 0.8362
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13850 on 133 degrees of freedom
## Multiple R-squared: 0.2163, Adjusted R-squared: 0.1927
## F-statistic: 9.177 on 4 and 133 DF, p-value: 1.405e-06
# M5: the three-variable model plus satisfaction with the MBA program.
M5 <- lm(
  salary ~ sex + work_yrs + frstlang + satis,
  data = Placed
)
summary(M5)
##
## Call:
## lm(formula = salary ~ sex + work_yrs + frstlang + satis, data = Placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31094 -7233 -1258 4672 88838
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 96134.6 6398.9 15.024 < 2e-16 ***
## sex2 -4249.6 2702.3 -1.573 0.1182
## work_yrs 2259.3 436.7 5.174 8.24e-07 ***
## frstlang2 6508.9 3833.1 1.698 0.0918 .
## satis -187.0 1104.3 -0.169 0.8658
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13850 on 133 degrees of freedom
## Multiple R-squared: 0.2162, Adjusted R-squared: 0.1926
## F-statistic: 9.173 on 4 and 133 DF, p-value: 1.415e-06
Hence, we obtained our regression equation as the one shown by M5. It says that “SEX”, “YEARS OF WORK EXPERIENCE”, “FIRST LANGUAGE SPOKEN” and “SATISFACTION LEVEL FROM MBA PROGRAM” affect the salaries.