PART A: PRIMARY ANALYSIS! READING DATA IN R 1.
# Switch to the folder holding the data set, then load the CSV into data.df.
setwd("c:/Users/vaibhav/Desktop/DataSets")
data.df <- read.csv(file = "MbaStartingSalariesData.csv")
# Open the loaded table in the data viewer for a quick visual check.
View(data.df)
SUMMARY OF DATA 2(a)
# Per-column min, quartiles, median, mean and max of the raw data
# (all rows, before any filtering).
summary(data.df)
## age sex gmat_tot gmat_qpc
## Min. :22.00 Min. :1.000 Min. :450.0 Min. :28.00
## 1st Qu.:25.00 1st Qu.:1.000 1st Qu.:580.0 1st Qu.:72.00
## Median :27.00 Median :1.000 Median :620.0 Median :83.00
## Mean :27.36 Mean :1.248 Mean :619.5 Mean :80.64
## 3rd Qu.:29.00 3rd Qu.:1.000 3rd Qu.:660.0 3rd Qu.:93.00
## Max. :48.00 Max. :2.000 Max. :790.0 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :16.00 Min. : 0.0 Min. :2.000 Min. :0.000
## 1st Qu.:71.00 1st Qu.:78.0 1st Qu.:2.708 1st Qu.:2.750
## Median :81.00 Median :87.0 Median :3.000 Median :3.000
## Mean :78.32 Mean :84.2 Mean :3.025 Mean :3.062
## 3rd Qu.:91.00 3rd Qu.:94.0 3rd Qu.:3.300 3rd Qu.:3.250
## Max. :99.00 Max. :99.0 Max. :4.000 Max. :4.000
## quarter work_yrs frstlang salary
## Min. :1.000 Min. : 0.000 Min. :1.000 Min. : 0
## 1st Qu.:1.250 1st Qu.: 2.000 1st Qu.:1.000 1st Qu.: 0
## Median :2.000 Median : 3.000 Median :1.000 Median : 999
## Mean :2.478 Mean : 3.872 Mean :1.117 Mean : 39026
## 3rd Qu.:3.000 3rd Qu.: 4.000 3rd Qu.:1.000 3rd Qu.: 97000
## Max. :4.000 Max. :22.000 Max. :2.000 Max. :220000
## satis
## Min. : 1.0
## 1st Qu.: 5.0
## Median : 6.0
## Mean :172.2
## 3rd Qu.: 7.0
## Max. :998.0
2(b)
# psych supplies describe(), which reports n, mean, sd, median, trimmed mean,
# mad, min, max, range, skew, kurtosis and se for every column of the raw data.
library(psych)
describe(data.df)
## vars n mean sd median trimmed mad min max
## age 1 274 27.36 3.71 27 26.76 2.97 22 48
## sex 2 274 1.25 0.43 1 1.19 0.00 1 2
## gmat_tot 3 274 619.45 57.54 620 618.86 59.30 450 790
## gmat_qpc 4 274 80.64 14.87 83 82.31 14.83 28 99
## gmat_vpc 5 274 78.32 16.86 81 80.33 14.83 16 99
## gmat_tpc 6 274 84.20 14.02 87 86.12 11.86 0 99
## s_avg 7 274 3.03 0.38 3 3.03 0.44 2 4
## f_avg 8 274 3.06 0.53 3 3.09 0.37 0 4
## quarter 9 274 2.48 1.11 2 2.47 1.48 1 4
## work_yrs 10 274 3.87 3.23 3 3.29 1.48 0 22
## frstlang 11 274 1.12 0.32 1 1.02 0.00 1 2
## salary 12 274 39025.69 50951.56 999 33607.86 1481.12 0 220000
## satis 13 274 172.18 371.61 6 91.50 1.48 1 998
## range skew kurtosis se
## age 26 2.16 6.45 0.22
## sex 1 1.16 -0.66 0.03
## gmat_tot 340 -0.01 0.06 3.48
## gmat_qpc 71 -0.92 0.30 0.90
## gmat_vpc 83 -1.04 0.74 1.02
## gmat_tpc 99 -2.28 9.02 0.85
## s_avg 2 -0.06 -0.38 0.02
## f_avg 4 -2.08 10.85 0.03
## quarter 3 0.02 -1.35 0.07
## work_yrs 22 2.78 9.80 0.20
## frstlang 1 2.37 3.65 0.02
## salary 220000 0.70 -1.05 3078.10
## satis 997 1.77 1.13 22.45
REMOVING PEOPLE WHO DIDN’T ANSWER SURVEY 3(a)
# Keep every row whose salary is not the 998 code; rows with a missing salary
# are dropped as well.
newdata.df <- subset(data.df, salary != 998)
View(newdata.df)
# Rows and columns remaining after the filter.
dim(newdata.df)
## [1] 228 13
3(b)SUMMARIZING THE NEW DATA SET
# Same per-column summary as before, now on the filtered data
# (rows with salary == 998 removed).
summary(newdata.df)
## age sex gmat_tot gmat_qpc
## Min. :22.00 Min. :1.000 Min. :450.0 Min. :28.00
## 1st Qu.:25.00 1st Qu.:1.000 1st Qu.:570.0 1st Qu.:72.00
## Median :27.00 Median :1.000 Median :620.0 Median :83.00
## Mean :27.57 Mean :1.259 Mean :617.3 Mean :80.24
## 3rd Qu.:29.00 3rd Qu.:2.000 3rd Qu.:660.0 3rd Qu.:92.25
## Max. :48.00 Max. :2.000 Max. :790.0 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :22.00 Min. : 0.00 Min. :2.000 Min. :0.000
## 1st Qu.:69.25 1st Qu.:75.00 1st Qu.:2.800 1st Qu.:2.750
## Median :81.00 Median :87.00 Median :3.000 Median :3.000
## Mean :77.85 Mean :83.61 Mean :3.031 Mean :3.059
## 3rd Qu.:91.00 3rd Qu.:94.00 3rd Qu.:3.300 3rd Qu.:3.250
## Max. :99.00 Max. :99.00 Max. :4.000 Max. :4.000
## quarter work_yrs frstlang salary
## Min. :1.000 Min. : 0.000 Min. :1.000 Min. : 0
## 1st Qu.:1.000 1st Qu.: 2.000 1st Qu.:1.000 1st Qu.: 0
## Median :2.000 Median : 3.000 Median :1.000 Median : 999
## Mean :2.478 Mean : 4.031 Mean :1.105 Mean : 46698
## 3rd Qu.:3.000 3rd Qu.: 5.000 3rd Qu.:1.000 3rd Qu.: 99250
## Max. :4.000 Max. :22.000 Max. :2.000 Max. :220000
## satis
## Min. :1.000
## 1st Qu.:5.000
## Median :6.000
## Mean :5.566
## 3rd Qu.:6.000
## Max. :7.000
3(c)Finding % unplaced
# Frequency of each distinct salary value, then the same counts expressed as
# a percentage of all rows in newdata.df.
table1 <- table(newdata.df$salary)
100 * prop.table(table1)
##
## 0 999 64000 77000 78256 82000
## 39.4736842 15.3508772 0.4385965 0.4385965 0.4385965 0.4385965
## 85000 86000 88000 88500 90000 92000
## 1.7543860 0.8771930 0.4385965 0.4385965 1.3157895 1.3157895
## 93000 95000 96000 96500 97000 98000
## 1.3157895 3.0701754 1.7543860 0.4385965 0.8771930 4.3859649
## 99000 100000 100400 101000 101100 101600
## 0.4385965 3.9473684 0.4385965 0.8771930 0.4385965 0.4385965
## 102500 103000 104000 105000 106000 107000
## 0.4385965 0.4385965 0.8771930 4.8245614 1.3157895 0.4385965
## 107300 107500 108000 110000 112000 115000
## 0.4385965 0.4385965 0.8771930 0.4385965 1.3157895 2.1929825
## 118000 120000 126710 130000 145800 146000
## 0.4385965 1.7543860 0.4385965 0.4385965 0.4385965 0.4385965
## 162000 220000
## 0.4385965 0.4385965
-> Hence, of the people who answered the survey, 39.47% were not placed (salary 0) and the remaining 60.53% were placed. FINDING THE FACTORS BEHIND BEING PLACED OR NOT PLACED BY COMPARING THE GROUP AVERAGES 4.
# Split the filtered data on salary: any non-zero salary goes to placed.df
# (this keeps the rows whose salary is 999), a salary of 0 goes to unplaced.df.
placed.df <- subset(newdata.df, salary != 0)
View(placed.df)
unplaced.df <- subset(newdata.df, salary == 0)
View(unplaced.df)
COMPARING PLACED AND UNPLACED STUDENTS DATA 5(a) UNPLACED
# Sex split of the unplaced group: raw counts, then percentages.
table2 <- table(unplaced.df$sex)
table2
##
## 1 2
## 67 23
100 * prop.table(table2)
##
## 1 2
## 74.44444 25.55556
# Distribution of satisfaction ratings within the unplaced group.
table4 <- table(unplaced.df$satis)
table4
##
## 4 5 6 7
## 4 36 40 10
# Group means for the unplaced respondents, one variable per numeric column
# that is compared against the placed group later on.
mean.age.unplaced <- mean(unplaced.df[["age"]])
mean.gmat_tot.unplaced <- mean(unplaced.df[["gmat_tot"]])
mean.gmat_qpc.unplaced <- mean(unplaced.df[["gmat_qpc"]])
mean.gmat_vpc.unplaced <- mean(unplaced.df[["gmat_vpc"]])
mean.gmat_tpc.unplaced <- mean(unplaced.df[["gmat_tpc"]])
mean.s_avg.unplaced <- mean(unplaced.df[["s_avg"]])
mean.f_avg.unplaced <- mean(unplaced.df[["f_avg"]])
mean.work_yrs.unplaced <- mean(unplaced.df[["work_yrs"]])
5(b) PLACED
# Sex split of the placed group: raw counts, then percentages.
table5 <- table(placed.df$sex)
table5
##
## 1 2
## 102 36
100 * prop.table(table5)
##
## 1 2
## 73.91304 26.08696
# Distribution of satisfaction ratings within the placed group.
table7 <- table(placed.df$satis)
table7
##
## 1 2 3 4 5 6 7
## 1 1 5 13 38 57 23
# Group means for the placed respondents, mirroring the unplaced means.
mean.age.placed <- mean(placed.df[["age"]])
mean.gmat_tot.placed <- mean(placed.df[["gmat_tot"]])
mean.gmat_qpc.placed <- mean(placed.df[["gmat_qpc"]])
mean.gmat_vpc.placed <- mean(placed.df[["gmat_vpc"]])
mean.gmat_tpc.placed <- mean(placed.df[["gmat_tpc"]])
mean.s_avg.placed <- mean(placed.df[["s_avg"]])
mean.f_avg.placed <- mean(placed.df[["f_avg"]])
mean.work_yrs.placed <- mean(placed.df[["work_yrs"]])
5(C)
# Placed-minus-unplaced gap for each group mean. A positive value means the
# placed group is higher on that variable. The gaps are formed from the group
# means already stored in mean.<var>.placed and mean.<var>.unplaced.
mean.age.DIFF <- mean.age.placed - mean.age.unplaced
mean.gmat_tot.DIFF <- mean.gmat_tot.placed - mean.gmat_tot.unplaced
mean.gmat_qpc.DIFF <- mean.gmat_qpc.placed - mean.gmat_qpc.unplaced
mean.gmat_vpc.DIFF <- mean.gmat_vpc.placed - mean.gmat_vpc.unplaced
mean.gmat_tpc.DIFF <- mean.gmat_tpc.placed - mean.gmat_tpc.unplaced
mean.s_avg.DIFF <- mean.s_avg.placed - mean.s_avg.unplaced
mean.f_avg.DIFF <- mean.f_avg.placed - mean.f_avg.unplaced
mean.work_yrs.DIFF <- mean.work_yrs.placed - mean.work_yrs.unplaced
# Print each gap in turn.
mean.age.DIFF
## [1] -1.554589
mean.gmat_tot.DIFF
## [1] 4.942029
mean.gmat_qpc.DIFF
## [1] 2.190338
mean.gmat_vpc.DIFF
## [1] 0.3594203
mean.gmat_tpc.DIFF
## [1] 2.189372
mean.s_avg.DIFF
## [1] -0.000352657
mean.f_avg.DIFF
## [1] -0.005777778
mean.work_yrs.DIFF
## [1] -0.9222222
EXECUTIVE SUMMARY 1: General Understanding 1. Age: Unplaced people are on average about 1.55 years older than placed people; in general, younger people got placed. 2. Sex: Placed: about 74% of placed people are male, the rest are female. Unplaced: about 74% of unplaced people are male, the rest are female. Hence sex does not appear to be a factor: males are the majority overall, so they are the majority among both the placed and the unplaced. 3. gmat_tot (GMAT total score): Placed people scored 4.94 points higher on average than unplaced people. 4. gmat_qpc (quantitative GMAT percentile): Placed people are 2.19 percentile points higher on average. 5. gmat_vpc (verbal GMAT percentile): Placed people are 0.36 percentile points higher on average. 6. gmat_tpc (overall GMAT percentile): Placed people are 2.19 percentile points higher on average. 7. s_avg (Spring MBA average): Surprisingly, unplaced people had a marginally higher Spring average (by about 0.0004). 8. f_avg (Fall MBA average): Surprisingly, unplaced people had a marginally higher Fall average (by about 0.006). 9. Working years: Surprisingly, unplaced people also had more work experience (about 0.92 years more on average). 10. Satisfaction: Unplaced people appear more satisfied: none of them rated below 4, whereas 7 placed people gave ratings of 1 to 3.
PART B COMPARISON BETWEEN PLACED AND UNPLACED
# Swap the numeric codes for readable labels. Writing a string into the column
# turns it into a character column, so the second comparison on each column
# matches against the text form of the code.
newdata.df$sex <- replace(newdata.df$sex, newdata.df$sex == 1, "Male")
newdata.df$sex <- replace(newdata.df$sex, newdata.df$sex == 2, "Female")
newdata.df$frstlang <- replace(
  newdata.df$frstlang, newdata.df$frstlang == 1, "English"
)
newdata.df$frstlang <- replace(
  newdata.df$frstlang, newdata.df$frstlang == 2, "Other"
)
# Flag rows whose salary exceeds 1000, stored as a FALSE/TRUE factor.
newdata.df$GotPlaced <- factor(newdata.df$salary > 1000)
View(newdata.df)
# Count of rows on each side of the flag.
table8 <- table(newdata.df$GotPlaced)
table8
##
## FALSE TRUE
## 125 103
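103 respondents have a salary above 1000 (TRUE); the other 125 do not (FALSE). The FALSE group covers both the 90 unplaced respondents (salary 0) and the 35 recorded with salary 999.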
# Label the sex and first-language codes in the placed group.
placed.df$sex <- replace(placed.df$sex, placed.df$sex == 1, "Male")
placed.df$sex <- replace(placed.df$sex, placed.df$sex == 2, "Female")
placed.df$frstlang <- replace(
  placed.df$frstlang, placed.df$frstlang == 1, "English"
)
placed.df$frstlang <- replace(
  placed.df$frstlang, placed.df$frstlang == 2, "Other"
)
# Placed respondents by sex.
table9 <- table(placed.df$sex)
table9
##
## Female Male
## 36 102
# Placed respondents by first language.
table10 <- table(placed.df$frstlang)
table10
##
## English Other
## 122 16
# Cross-tabulation: sex (rows) by first language (columns).
table14 <- table(placed.df$sex, placed.df$frstlang)
table14
##
## English Other
## Female 31 5
## Male 91 11
102 males and 36 females were placed. Of the placed respondents, 122 have English as their first language and 16 have another first language.
# Label the sex and first-language codes in the unplaced group.
unplaced.df$sex <- replace(unplaced.df$sex, unplaced.df$sex == 1, "Male")
unplaced.df$sex <- replace(unplaced.df$sex, unplaced.df$sex == 2, "Female")
unplaced.df$frstlang <- replace(
  unplaced.df$frstlang, unplaced.df$frstlang == 1, "English"
)
unplaced.df$frstlang <- replace(
  unplaced.df$frstlang, unplaced.df$frstlang == 2, "Other"
)
# Unplaced respondents by sex.
table11 <- table(unplaced.df$sex)
table11
##
## Female Male
## 23 67
# Unplaced respondents by first language.
table12 <- table(unplaced.df$frstlang)
table12
##
## English Other
## 82 8
# Cross-tabulation: sex (rows) by first language (columns).
table13 <- table(unplaced.df$sex, unplaced.df$frstlang)
table13
##
## English Other
## Female 22 1
## Male 60 7
67 males and 23 females were unplaced; 82 of the unplaced have English as their first language and 8 do not. PART C: VISUALIZATION Visualizing Salary vs Various Factors
# Box plots of salary against age, sex, first language and satisfaction.
#
# Only rows with salary > 1000 are plotted, the same cut-off used for
# GotPlaced. The raw data.df still contains salary values 0, 998 and 999 and
# satis values up to 998, which look like placeholder codes rather than
# measurements (TODO confirm against the data dictionary); plotting them
# squashes the real salary range and adds a bogus satisfaction category.
# newdata.df also carries the Male/Female and English/Other labels, so the
# group axes are readable.
salaried.df <- newdata.df[which(newdata.df$salary > 1000), ]

# Use a 2 x 2 panel layout, remembering the previous setting so it can be
# restored once the four plots are drawn.
old.par <- par(mfrow = c(2, 2))
boxplot(salary ~ age, data = salaried.df,
        main = "Salary vs age", horizontal = TRUE)
boxplot(salary ~ sex, data = salaried.df,
        main = "Salary vs sex", horizontal = TRUE)
boxplot(salary ~ frstlang, data = salaried.df,
        main = "Salary vs Language", horizontal = TRUE)
boxplot(salary ~ satis, data = salaried.df,
        main = "Salary vs Satisfaction", horizontal = TRUE)
par(old.par)
PART D: REGRESSION Dependent variable: salary. Independent variables: age, sex, gmat_tot, gmat_qpc, gmat_vpc, gmat_tpc, s_avg, f_avg, quarter, work_yrs, frstlang, satis
Model 1: Seeing how GMAT scores (gmat_tot, gmat_qpc, gmat_vpc) and work experience (work_yrs) affect salary
# Model 1: linear regression of salary on GMAT total score, quantitative and
# verbal percentiles, and years of work experience.
#
# placed.df is built with salary != 0, so it still holds the rows whose salary
# is 999. That value appears to be a placeholder code, not a salary (GotPlaced
# uses salary > 1000) -- TODO confirm against the data dictionary. Fitting on
# those rows treats the code as a 999 salary, so the model is fitted on rows
# with salary > 1000 only.
disclosed.df <- placed.df[which(placed.df$salary > 1000), ]
fit1 <- lm(
  salary ~ gmat_tot + gmat_qpc + gmat_vpc + work_yrs,
  data = disclosed.df
)
summary(fit1)
## NOTE(review): the output below was recorded from the earlier fit on all of
## placed.df (133 residual degrees of freedom, i.e. the salary == 999 rows were
## included). Re-run this chunk to refresh it.
##
## Call:
## lm(formula = placed.df$salary ~ placed.df$gmat_tot + placed.df$gmat_qpc +
## placed.df$gmat_vpc + placed.df$work_yrs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -96032 -23043 16926 28116 114319
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 242933.2 81083.6 2.996 0.00326 **
## placed.df$gmat_tot -575.7 303.5 -1.897 0.05997 .
## placed.df$gmat_qpc 918.3 793.0 1.158 0.24893
## placed.df$gmat_vpc 1387.2 721.4 1.923 0.05663 .
## placed.df$work_yrs 2206.0 1469.0 1.502 0.13553
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 46240 on 133 degrees of freedom
## Multiple R-squared: 0.06599, Adjusted R-squared: 0.0379
## F-statistic: 2.349 on 4 and 133 DF, p-value: 0.05752