# Load the MBA starting-salaries data set and take a first look at it.
saldata <- read.csv("MBA Starting Salaries Data.csv")
# Put the data frame's columns on the search path.
attach(saldata)
View(saldata)
# Dimensions first, then the per-column structure.
dim(saldata)
## [1] 274 13
str(saldata)
## 'data.frame': 274 obs. of 13 variables:
## $ age : int 23 24 24 24 24 24 25 25 25 25 ...
## $ sex : int 2 1 1 1 2 1 1 2 1 1 ...
## $ gmat_tot: int 620 610 670 570 710 640 610 650 630 680 ...
## $ gmat_qpc: int 77 90 99 56 93 82 89 88 79 99 ...
## $ gmat_vpc: int 87 71 78 81 98 89 74 89 91 81 ...
## $ gmat_tpc: int 87 87 95 75 98 91 87 92 89 96 ...
## $ s_avg : num 3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
## $ f_avg : num 3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
## $ quarter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ work_yrs: int 2 2 2 1 2 2 2 2 2 2 ...
## $ frstlang: int 1 1 1 1 1 1 1 1 2 1 ...
## $ salary : int 0 0 0 0 999 0 0 0 999 998 ...
## $ satis : int 7 6 6 7 5 6 5 6 4 998 ...
# Turn the integer-coded categoricals into labelled factors.
# The codes are mapped straight to labels via factor(levels =, labels =) rather
# than by overwriting integers with strings, which silently coerced the column
# to character and then compared that character column against the number 2.
# Level order is chosen to match the alphabetical order the previous recode
# produced ("Female" < "Male", "English" < "Other"), so model reference levels
# and plot ordering are unchanged.
# The is.factor() guards make the block safe to re-run in the same session.
if (!is.factor(saldata$frstlang)) {
  saldata$frstlang <- factor(
    saldata$frstlang,
    levels = c(1, 2),
    labels = c("English", "Other")
  )
}
if (!is.factor(saldata$sex)) {
  saldata$sex <- factor(
    saldata$sex,
    levels = c(2, 1),
    labels = c("Female", "Male")
  )
}
#View(saldata)
str(saldata)
## 'data.frame': 274 obs. of 13 variables:
## $ age : int 23 24 24 24 24 24 25 25 25 25 ...
## $ sex : Factor w/ 2 levels "Female","Male": 1 2 2 2 1 2 2 1 2 2 ...
## $ gmat_tot: int 620 610 670 570 710 640 610 650 630 680 ...
## $ gmat_qpc: int 77 90 99 56 93 82 89 88 79 99 ...
## $ gmat_vpc: int 87 71 78 81 98 89 74 89 91 81 ...
## $ gmat_tpc: int 87 87 95 75 98 91 87 92 89 96 ...
## $ s_avg : num 3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
## $ f_avg : num 3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
## $ quarter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ work_yrs: int 2 2 2 1 2 2 2 2 2 2 ...
## $ frstlang: Factor w/ 2 levels "English","Other": 1 1 1 1 1 1 1 1 2 1 ...
## $ salary : int 0 0 0 0 999 0 0 0 999 998 ...
## $ satis : int 7 6 6 7 5 6 5 6 4 998 ...
# Analysis subsets of saldata, selected on the codes stored in `satis` and
# `salary`. The filter columns are referenced through saldata explicitly
# instead of as bare names resolved on the search path, so a same-named global
# variable can no longer silently change which rows are selected.
# NOTE(review): 998 / 999 look like "no answer" sentinel codes and 0 like
# "not placed" -- confirm against the data dictionary.

# Rows whose satisfaction score is not the 998 code.
surveydone <- saldata[saldata$satis != 998, ]
#View(surveydone)
# Rows whose salary is neither 999 nor 998 (zeros are kept).
saltold <- saldata[which(saldata$salary != 999 & saldata$salary != 998), ]
#View(saltold)
# Rows with an actual salary figure (anything above the sentinel codes).
placed <- saldata[which(saldata$salary > 999), ]
#View(placed)
# Rows whose salary is the 999 code.
salnot <- saldata[which(saldata$salary == 999), ]
#View(salnot)
# Rows whose salary is recorded as 0.
notplaced <- saldata[which(saldata$salary == 0), ]
View(notplaced)
summary(surveydone)
## age sex gmat_tot gmat_qpc
## Min. :22.00 Female: 59 Min. :450.0 Min. :28.00
## 1st Qu.:25.00 Male :169 1st Qu.:570.0 1st Qu.:72.00
## Median :27.00 Median :620.0 Median :83.00
## Mean :27.57 Mean :617.3 Mean :80.24
## 3rd Qu.:29.00 3rd Qu.:660.0 3rd Qu.:92.25
## Max. :48.00 Max. :790.0 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :22.00 Min. : 0.00 Min. :2.000 Min. :0.000
## 1st Qu.:69.25 1st Qu.:75.00 1st Qu.:2.800 1st Qu.:2.750
## Median :81.00 Median :87.00 Median :3.000 Median :3.000
## Mean :77.85 Mean :83.61 Mean :3.031 Mean :3.059
## 3rd Qu.:91.00 3rd Qu.:94.00 3rd Qu.:3.300 3rd Qu.:3.250
## Max. :99.00 Max. :99.00 Max. :4.000 Max. :4.000
## quarter work_yrs frstlang salary
## Min. :1.000 Min. : 0.000 English:204 Min. : 0
## 1st Qu.:1.000 1st Qu.: 2.000 Other : 24 1st Qu.: 0
## Median :2.000 Median : 3.000 Median : 999
## Mean :2.478 Mean : 4.031 Mean : 46698
## 3rd Qu.:3.000 3rd Qu.: 5.000 3rd Qu.: 99250
## Max. :4.000 Max. :22.000 Max. :220000
## satis
## Min. :1.000
## 1st Qu.:5.000
## Median :6.000
## Mean :5.566
## 3rd Qu.:6.000
## Max. :7.000
# Mean salary over saltold (salary neither 998 nor 999, zeros included),
# then over placed students only.
mean(saltold[["salary"]])
## [1] 54985.32
mean(placed[["salary"]])
## [1] 103030.7

# One plot per page: salary and satisfaction of placed students.
par(mfrow = c(1, 1))
hist(
  placed$salary,
  main = "Salary of Placed Students",
  col = "Green",
  breaks = 20
)
hist(
  placed$satis,
  main = "Satisfaction of Placed Students",
  col = "navy blue"
)

# Side by side: fall and spring averages for every student.
par(mfrow = c(1, 2))
hist(
  saldata$f_avg,
  main = "Fall average of all students",
  col = "brown",
  breaks = 20
)
hist(
  saldata$s_avg,
  main = "Spring average of all students",
  col = "brown",
  breaks = 20
)

# 2x2 grid: one box plot per GMAT measure, drawn in the original order.
par(mfrow = c(2, 2))
invisible(Map(
  function(scores, title) boxplot(scores, col = "orange", main = title),
  saldata[c("gmat_tot", "gmat_qpc", "gmat_vpc", "gmat_tpc")],
  c(
    "boxplot of Total GMAT score",
    "boxplot of quantitative GMAT percentile score",
    "boxplot of verbal GMAT percentile score",
    "boxplot of overall GMAT percentile score"
  )
))
boxplot(
  saldata$work_yrs,
  main = "Boxplot of Work Experience",
  col = "purple"
)
boxplot(
  placed$salary ~ placed$age,
  main = "Boxplot of Salary vs age of placed students"
)

# car provides scatterplot().
library(car)
## Warning: package 'car' was built under R version 3.4.3
scatterplot(
  placed$salary ~ placed$age,
  main = "ScatterPlot of salary vs age of placed students"
)
From the above visualisations, we can observe that the salary of placed MBA graduates tends to be higher for older graduates.
To check whether salary and age are correlated, let us run the following correlation test:
# Pearson correlation test between salary and age among placed students.
cor.test(placed$salary,placed$age)
##
## Pearson's product-moment correlation
##
## data: placed$salary and placed$age
## t = 5.7968, df = 101, p-value = 7.748e-08
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3388862 0.6320523
## sample estimates:
## cor
## 0.4996428
The test shows a positive correlation between salary and age which is statistically significant.
# Salary of placed students split by sex. `sex` is a factor with levels
# "Female", "Male" by this point, so the boxes are labelled by name and drawn
# in that order (blue = Female, red = Male). The old axis title advertised the
# numeric codes "1=Male, 2=Female", which no longer appear on the axis and list
# the groups in the opposite order, so it is replaced by a plain title.
boxplot(placed$salary~placed$sex,col=c("blue","red"), ylab="Salary", xlab="Gender")
From this we can observe that, among placed students, men are paid slightly more than women.
# Chi-squared test of independence between salary and sex for placed students.
# NOTE(review): chisq.test() cross-tabulates its two arguments, so every
# distinct salary value becomes its own category (hence df = 41 and the
# "approximation may be incorrect" warning in the recorded output). For a
# numeric outcome a two-sample comparison such as
# t.test(salary ~ sex, data = placed) is probably the better fit -- confirm
# before relying on this result.
chisq.test(placed$salary,placed$sex)
## Warning in chisq.test(placed$salary, placed$sex): Chi-squared approximation
## may be incorrect
##
## Pearson's Chi-squared test
##
## data: placed$salary and placed$sex
## X-squared = 52.681, df = 41, p-value = 0.1045
Since the p-value (0.1045) is greater than 0.05, we cannot reject the null hypothesis that salary and sex are independent of each other.
# Salary of placed students against each of the four GMAT measures.
# A 2x2 plotting grid is requested first.
par(mfrow=c(2,2))
scatterplot(placed$salary~placed$gmat_tot,main="scatterplot of Total GMAT score")
scatterplot(placed$salary~placed$gmat_qpc,main="scatter plot of Quantitative GMAT percentile score")
scatterplot(placed$salary~placed$gmat_vpc,main="scatter plot of Verbal GMAT percentile score")
scatterplot(placed$salary~placed$gmat_tpc,main="scatter plot of Total GMAT percentile score")
# Welch two-sample t-test: total GMAT score, placed vs. not-placed students.
t.test(placed$gmat_tot,notplaced$gmat_tot)
##
## Welch Two Sample t-test
##
## data: placed$gmat_tot and notplaced$gmat_tot
## t = 0.20321, df = 170.77, p-value = 0.8392
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -14.69189 18.06406
## sample estimates:
## mean of x mean of y
## 616.0194 614.3333
# Welch two-sample t-test: overall GMAT percentile, placed vs. not-placed students.
t.test(placed$gmat_tpc,notplaced$gmat_tpc)
##
## Welch Two Sample t-test
##
## data: placed$gmat_tpc and notplaced$gmat_tpc
## t = 1.119, df = 155.27, p-value = 0.2649
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.710571 6.181337
## sample estimates:
## mean of x mean of y
## 84.52427 82.28889
Therefore, there is no statistically significant difference between the GMAT scores of placed and not placed students.
library(car)
# Pairwise scatterplots of salary, fall average and spring average for placed students.
scatterplotMatrix(~placed$salary+placed$f_avg+placed$s_avg,main="ScatterPlot Matrix of salary, f_avg, s_avg of placed students")
To check if there is any difference between the average semester scores of placed and not placed students, we run the following t-tests:
# Welch two-sample t-test: spring average, placed vs. not-placed students.
t.test(placed$s_avg,notplaced$s_avg)
##
## Welch Two Sample t-test
##
## data: placed$s_avg and notplaced$s_avg
## t = 1.118, df = 187.46, p-value = 0.265
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.04671895 0.16893470
## sample estimates:
## mean of x mean of y
## 3.092330 3.031222
# Welch two-sample t-test: fall average, placed vs. not-placed students.
t.test(placed$f_avg,notplaced$f_avg)
##
## Welch Two Sample t-test
##
## data: placed$f_avg and notplaced$f_avg
## t = 0.37631, df = 178.47, p-value = 0.7071
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.1210636 0.1781164
## sample estimates:
## mean of x mean of y
## 3.090971 3.062444
There is no significant difference between the scores of placed and not placed students
#plot(placed$frstlang,placed$salary)
# First-language counts among placed students ...
table(placed[["frstlang"]])
##
## English Other
## 96 7
# ... and among not-placed students.
table(notplaced[["frstlang"]])
##
## English Other
## 82 8
# Cross-tabulate "salary above 1000" against first language over saltold.
tab3 <- xtabs(~ (salary > 1000) + frstlang, data = saltold)
print(tab3)
## frstlang
## salary > 1000 English Other
## FALSE 82 8
## TRUE 96 7
# Convert the counts to proportions within each first-language column.
ptab3 <- prop.table(tab3, margin = 2)
print(ptab3)
## frstlang
## salary > 1000 English Other
## FALSE 0.4606742 0.5333333
## TRUE 0.5393258 0.4666667
# Test whether placement is independent of first language. chisq.test() must be
# given the observed counts (tab3); the column proportions in ptab3 are not
# frequencies, which is why the earlier run on ptab3 reported X-squared ~ 0
# and a p-value of exactly 1.
chisq.test(tab3)
## (Output not re-rendered. A hand calculation from the tab3 counts gives
## roughly X-squared = 0.07 on df = 1, p-value about 0.79; re-run to confirm.)
From the above chi-square test, we find no evidence that getting placed depends on a student's first language.
# Salary against years of work experience for placed students, first as a
# scatterplot and then as one box per distinct work_yrs value.
scatterplot(placed$salary~placed$work_yrs)
boxplot(placed$salary~placed$work_yrs)
To check if there is any difference between the work experience of placed and not placed students, we run the following t-test:
# Welch two-sample t-test: years of work experience, placed vs. not-placed students.
t.test(placed$work_yrs,notplaced$work_yrs)
##
## Welch Two Sample t-test
##
## data: placed$work_yrs and notplaced$work_yrs
## t = -1.6778, df = 156.44, p-value = 0.09538
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.9797552 0.1612007
## sample estimates:
## mean of x mean of y
## 3.679612 4.588889
There is no statistically significant difference between the work experience of placed and not placed students
# Pearson correlation test between age and years of work experience (placed students).
cor.test(placed$age,placed$work_yrs)
##
## Pearson's product-moment correlation
##
## data: placed$age and placed$work_yrs
## t = 18.669, df = 101, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8281132 0.9176746
## sample estimates:
## cor
## 0.8805247
There is a strong positive correlation between Age and Work Experience of an MBA graduate
# Pearson correlation test between salary and programme satisfaction (placed students).
# NOTE(review): assumes no placed row carries the 998 code in `satis` (the code
# that is filtered out when surveydone is built) -- TODO confirm, since such a
# value would be treated as a real score here.
cor.test(placed$salary,placed$satis)
##
## Pearson's product-moment correlation
##
## data: placed$salary and placed$satis
## t = -0.40283, df = 101, p-value = 0.6879
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.2317788 0.1546729
## sample estimates:
## cor
## -0.0400506
# Pearson correlation test between satisfaction and fall average (placed students).
cor.test(placed$satis,placed$f_avg)
##
## Pearson's product-moment correlation
##
## data: placed$satis and placed$f_avg
## t = -1.1915, df = 101, p-value = 0.2363
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.30432392 0.07755874
## sample estimates:
## cor
## -0.117733
# Pearson correlation test between satisfaction and spring average (placed students).
cor.test(placed$satis,placed$s_avg)
##
## Pearson's product-moment correlation
##
## data: placed$satis and placed$s_avg
## t = -1.4579, df = 101, p-value = 0.148
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.3279779 0.0513868
## sample estimates:
## cor
## -0.1435656
# Pearson correlation test between satisfaction and the quartile code (placed students).
cor.test(placed$satis,placed$quarter)
##
## Pearson's product-moment correlation
##
## data: placed$satis and placed$quarter
## t = 2.322, df = 101, p-value = 0.02224
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.03303437 0.40116717
## sample estimates:
## cor
## 0.2251199
There is no significant correlation between satisfaction with the MBA program and the average GPAs. However, there is a weak but statistically significant positive correlation (r ≈ 0.23, p ≈ 0.022) between the quartile rank and satisfaction.
# Pearson correlation of each term average with the quartile code, computed
# over the surveydone rows: spring first, then fall.
with(surveydone, cor(s_avg, quarter))
## [1] -0.7602882
with(surveydone, cor(f_avg, quarter))
## [1] -0.4141323
As the spring and fall average GPAs rise, the quartile rank decreases (both correlations are negative).
library(corrgram)
## Warning: package 'corrgram' was built under R version 3.4.3
# Correlogram of the placed-students data frame.
corrgram(placed)
# Linear model of salary for placed students on work experience, satisfaction,
# fall/spring averages, total GMAT score, sex and first language.
model <- lm(
  salary ~ work_yrs + satis + f_avg + s_avg + gmat_tot + sex + frstlang,
  data = placed
)
summary(model)
##
## Call:
## lm(formula = salary ~ work_yrs + satis + f_avg + s_avg + gmat_tot +
## sex + frstlang, data = placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29707 -8168 -2187 5698 82749
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 97193.01 25417.81 3.824 0.000235 ***
## work_yrs 2238.46 581.26 3.851 0.000213 ***
## satis -2000.50 2043.62 -0.979 0.330116
## f_avg -1274.09 3825.09 -0.333 0.739803
## s_avg 4265.73 5023.59 0.849 0.397938
## gmat_tot -8.48 31.96 -0.265 0.791361
## sexMale 6128.86 3472.39 1.765 0.080773 .
## frstlangOther 15563.87 6492.06 2.397 0.018469 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15760 on 95 degrees of freedom
## Multiple R-squared: 0.2752, Adjusted R-squared: 0.2217
## F-statistic: 5.152 on 7 and 95 DF, p-value: 5.365e-05
# Print the model's fitted salaries, named by the row labels of `placed`.
fitted(model)
## 35 36 37 38 39 40 41
## 93984.38 93667.03 97429.25 94755.56 99392.57 102850.84 99574.39
## 42 43 44 45 46 47 48
## 94297.76 104093.20 95662.41 101077.25 92835.62 100874.55 109194.40
## 49 50 51 52 53 54 55
## 98537.91 103840.97 104951.09 103135.33 107359.26 110813.32 113076.20
## 56 57 58 59 60 61 62
## 104050.82 104628.45 103735.50 106091.36 130942.99 108126.44 100434.34
## 63 64 65 66 67 68 69
## 132479.94 106054.52 109689.07 108212.41 107352.17 136616.23 100200.61
## 115 116 117 118 119 120 121
## 88727.40 95174.27 114048.81 99305.20 102669.58 98556.02 100849.22
## 122 123 124 125 126 127 128
## 100302.12 100471.72 113929.59 116429.25 95405.42 104882.73 95174.27
## 129 130 131 132 133 134 135
## 102604.70 107033.94 101777.38 103752.06 133445.60 101430.07 108469.50
## 136 137 138 139 186 187 188
## 96956.12 104758.36 102269.73 99939.67 93886.23 100790.35 97106.70
## 189 190 191 192 193 194 195
## 100786.12 103113.61 98181.33 101881.45 101237.61 90214.63 91465.96
## 196 197 198 199 200 201 202
## 92551.77 114460.11 100794.59 111197.59 100561.78 92764.81 92700.69
## 203 204 205 206 207 208 209
## 111751.49 109508.80 101685.48 107531.55 111673.64 103987.44 98469.67
## 256 257 258 259 260 261 262
## 93706.87 97041.03 96143.10 98227.74 96793.30 106370.70 98561.16
## 263 264 265 266 267 268 269
## 95356.47 98186.87 97588.13 91847.54 89423.04 98597.00 100278.98
## 270 271 272 273 274
## 118076.16 97713.80 103361.21 100957.38 137251.31
# Side-by-side table of fitted and observed salary for placed students.
# Columns are named explicitly; the previous construction left them with the
# auto-generated names `fitted.model.` and the non-syntactic `placed$salary`.
predicted.salary <- data.frame(predicted = fitted(model))
# fitted() only covers rows the model actually used, so make sure nothing was
# dropped before binding the two columns positionally.
stopifnot(nrow(predicted.salary) == nrow(placed))
Compare <- cbind(predicted.salary, actual = placed$salary)
View(Compare)