PART A: PRIMARY ANALYSIS! READING DATA IN R 1.

# Switch to the folder holding the data sets, read
# MbaStartingSalariesData.csv into a data frame, and open it in the viewer.
setwd("c:/Users/vaibhav/Desktop/DataSets")
data.df <- read.csv(file = "MbaStartingSalariesData.csv")
View(data.df)

SUMMARY OF DATA 2(a)

# Per-column minimum, quartiles, mean and maximum of the raw data.
summary(object = data.df)
##       age             sex           gmat_tot        gmat_qpc    
##  Min.   :22.00   Min.   :1.000   Min.   :450.0   Min.   :28.00  
##  1st Qu.:25.00   1st Qu.:1.000   1st Qu.:580.0   1st Qu.:72.00  
##  Median :27.00   Median :1.000   Median :620.0   Median :83.00  
##  Mean   :27.36   Mean   :1.248   Mean   :619.5   Mean   :80.64  
##  3rd Qu.:29.00   3rd Qu.:1.000   3rd Qu.:660.0   3rd Qu.:93.00  
##  Max.   :48.00   Max.   :2.000   Max.   :790.0   Max.   :99.00  
##     gmat_vpc        gmat_tpc        s_avg           f_avg      
##  Min.   :16.00   Min.   : 0.0   Min.   :2.000   Min.   :0.000  
##  1st Qu.:71.00   1st Qu.:78.0   1st Qu.:2.708   1st Qu.:2.750  
##  Median :81.00   Median :87.0   Median :3.000   Median :3.000  
##  Mean   :78.32   Mean   :84.2   Mean   :3.025   Mean   :3.062  
##  3rd Qu.:91.00   3rd Qu.:94.0   3rd Qu.:3.300   3rd Qu.:3.250  
##  Max.   :99.00   Max.   :99.0   Max.   :4.000   Max.   :4.000  
##     quarter         work_yrs         frstlang         salary      
##  Min.   :1.000   Min.   : 0.000   Min.   :1.000   Min.   :     0  
##  1st Qu.:1.250   1st Qu.: 2.000   1st Qu.:1.000   1st Qu.:     0  
##  Median :2.000   Median : 3.000   Median :1.000   Median :   999  
##  Mean   :2.478   Mean   : 3.872   Mean   :1.117   Mean   : 39026  
##  3rd Qu.:3.000   3rd Qu.: 4.000   3rd Qu.:1.000   3rd Qu.: 97000  
##  Max.   :4.000   Max.   :22.000   Max.   :2.000   Max.   :220000  
##      satis      
##  Min.   :  1.0  
##  1st Qu.:  5.0  
##  Median :  6.0  
##  Mean   :172.2  
##  3rd Qu.:  7.0  
##  Max.   :998.0

2(b)

# Extended per-column descriptive statistics from the psych package.
library(psych)
psych::describe(x = data.df)
##          vars   n     mean       sd median  trimmed     mad min    max
## age         1 274    27.36     3.71     27    26.76    2.97  22     48
## sex         2 274     1.25     0.43      1     1.19    0.00   1      2
## gmat_tot    3 274   619.45    57.54    620   618.86   59.30 450    790
## gmat_qpc    4 274    80.64    14.87     83    82.31   14.83  28     99
## gmat_vpc    5 274    78.32    16.86     81    80.33   14.83  16     99
## gmat_tpc    6 274    84.20    14.02     87    86.12   11.86   0     99
## s_avg       7 274     3.03     0.38      3     3.03    0.44   2      4
## f_avg       8 274     3.06     0.53      3     3.09    0.37   0      4
## quarter     9 274     2.48     1.11      2     2.47    1.48   1      4
## work_yrs   10 274     3.87     3.23      3     3.29    1.48   0     22
## frstlang   11 274     1.12     0.32      1     1.02    0.00   1      2
## salary     12 274 39025.69 50951.56    999 33607.86 1481.12   0 220000
## satis      13 274   172.18   371.61      6    91.50    1.48   1    998
##           range  skew kurtosis      se
## age          26  2.16     6.45    0.22
## sex           1  1.16    -0.66    0.03
## gmat_tot    340 -0.01     0.06    3.48
## gmat_qpc     71 -0.92     0.30    0.90
## gmat_vpc     83 -1.04     0.74    1.02
## gmat_tpc     99 -2.28     9.02    0.85
## s_avg         2 -0.06    -0.38    0.02
## f_avg         4 -2.08    10.85    0.03
## quarter       3  0.02    -1.35    0.07
## work_yrs     22  2.78     9.80    0.20
## frstlang      1  2.37     3.65    0.02
## salary   220000  0.70    -1.05 3078.10
## satis       997  1.77     1.13   22.45

REMOVING PEOPLE WHO DIDN’T ANSWER SURVEY 3(a)

# Keep only the rows whose salary is not the code 998, open the result in the
# viewer, and report its dimensions.
newdata.df <- subset(data.df, salary != 998)
View(newdata.df)
dim(newdata.df)
## [1] 228  13

3(b)SUMMARIZING THE NEW DATA SET

# Per-column minimum, quartiles, mean and maximum after the 998 rows are gone.
summary(object = newdata.df)
##       age             sex           gmat_tot        gmat_qpc    
##  Min.   :22.00   Min.   :1.000   Min.   :450.0   Min.   :28.00  
##  1st Qu.:25.00   1st Qu.:1.000   1st Qu.:570.0   1st Qu.:72.00  
##  Median :27.00   Median :1.000   Median :620.0   Median :83.00  
##  Mean   :27.57   Mean   :1.259   Mean   :617.3   Mean   :80.24  
##  3rd Qu.:29.00   3rd Qu.:2.000   3rd Qu.:660.0   3rd Qu.:92.25  
##  Max.   :48.00   Max.   :2.000   Max.   :790.0   Max.   :99.00  
##     gmat_vpc        gmat_tpc         s_avg           f_avg      
##  Min.   :22.00   Min.   : 0.00   Min.   :2.000   Min.   :0.000  
##  1st Qu.:69.25   1st Qu.:75.00   1st Qu.:2.800   1st Qu.:2.750  
##  Median :81.00   Median :87.00   Median :3.000   Median :3.000  
##  Mean   :77.85   Mean   :83.61   Mean   :3.031   Mean   :3.059  
##  3rd Qu.:91.00   3rd Qu.:94.00   3rd Qu.:3.300   3rd Qu.:3.250  
##  Max.   :99.00   Max.   :99.00   Max.   :4.000   Max.   :4.000  
##     quarter         work_yrs         frstlang         salary      
##  Min.   :1.000   Min.   : 0.000   Min.   :1.000   Min.   :     0  
##  1st Qu.:1.000   1st Qu.: 2.000   1st Qu.:1.000   1st Qu.:     0  
##  Median :2.000   Median : 3.000   Median :1.000   Median :   999  
##  Mean   :2.478   Mean   : 4.031   Mean   :1.105   Mean   : 46698  
##  3rd Qu.:3.000   3rd Qu.: 5.000   3rd Qu.:1.000   3rd Qu.: 99250  
##  Max.   :4.000   Max.   :22.000   Max.   :2.000   Max.   :220000  
##      satis      
##  Min.   :1.000  
##  1st Qu.:5.000  
##  Median :6.000  
##  Mean   :5.566  
##  3rd Qu.:6.000  
##  Max.   :7.000

3(c)Finding % unplaced

# Count the rows at each distinct salary value, then express the counts as
# percentages of all rows in newdata.df.
table1 <- table(newdata.df$salary)
100 * prop.table(table1)
## 
##          0        999      64000      77000      78256      82000 
## 39.4736842 15.3508772  0.4385965  0.4385965  0.4385965  0.4385965 
##      85000      86000      88000      88500      90000      92000 
##  1.7543860  0.8771930  0.4385965  0.4385965  1.3157895  1.3157895 
##      93000      95000      96000      96500      97000      98000 
##  1.3157895  3.0701754  1.7543860  0.4385965  0.8771930  4.3859649 
##      99000     100000     100400     101000     101100     101600 
##  0.4385965  3.9473684  0.4385965  0.8771930  0.4385965  0.4385965 
##     102500     103000     104000     105000     106000     107000 
##  0.4385965  0.4385965  0.8771930  4.8245614  1.3157895  0.4385965 
##     107300     107500     108000     110000     112000     115000 
##  0.4385965  0.4385965  0.8771930  0.4385965  1.3157895  2.1929825 
##     118000     120000     126710     130000     145800     146000 
##  0.4385965  1.7543860  0.4385965  0.4385965  0.4385965  0.4385965 
##     162000     220000 
##  0.4385965  0.4385965

->Hence, out of all the people who answered the survey, 39.47% were not placed and the remaining 60.53% are treated as placed (this 60.53% includes the 15.35% whose salary is coded 999). FINDING THE FACTORS OF WHY ONE WAS NOT PLACED OR PLACED BY COMPARING THE AVERAGES OF ALL 4.

# Split newdata.df on the salary value: every row with a non-zero salary goes
# to placed.df (rows coded 999 are therefore kept in it), and every row with
# salary 0 goes to unplaced.df. Each subset is opened in the viewer.
placed.df <- subset(newdata.df, salary != 0)
View(placed.df)
unplaced.df <- subset(newdata.df, salary == 0)
View(unplaced.df)

COMPARING PLACED AND UNPLACED STUDENTS DATA 5(a) UNPLACED

# Counts and percentages of each sex code among the unplaced rows.
table2 <- table(unplaced.df$sex)
table2
## 
##  1  2 
## 67 23
100 * prop.table(table2)
## 
##        1        2 
## 74.44444 25.55556
# Counts of each satisfaction rating among the unplaced rows.
table4 <- table(unplaced.df$satis)
table4
## 
##  4  5  6  7 
##  4 36 40 10
# Column means for the unplaced rows, one variable per compared column.
mean.age.unplaced <- mean(unplaced.df[["age"]])
mean.gmat_tot.unplaced <- mean(unplaced.df[["gmat_tot"]])
mean.gmat_qpc.unplaced <- mean(unplaced.df[["gmat_qpc"]])
mean.gmat_vpc.unplaced <- mean(unplaced.df[["gmat_vpc"]])
mean.gmat_tpc.unplaced <- mean(unplaced.df[["gmat_tpc"]])
mean.s_avg.unplaced <- mean(unplaced.df[["s_avg"]])
mean.f_avg.unplaced <- mean(unplaced.df[["f_avg"]])
mean.work_yrs.unplaced <- mean(unplaced.df[["work_yrs"]])

5(b) PLACED

# Counts and percentages of each sex code among the placed rows.
table5 <- table(placed.df$sex)
table5
## 
##   1   2 
## 102  36
100 * prop.table(table5)
## 
##        1        2 
## 73.91304 26.08696
# Counts of each satisfaction rating among the placed rows.
table7 <- table(placed.df$satis)
table7
## 
##  1  2  3  4  5  6  7 
##  1  1  5 13 38 57 23
# Column means for the placed rows, one variable per compared column.
mean.age.placed <- mean(placed.df[["age"]])
mean.gmat_tot.placed <- mean(placed.df[["gmat_tot"]])
mean.gmat_qpc.placed <- mean(placed.df[["gmat_qpc"]])
mean.gmat_vpc.placed <- mean(placed.df[["gmat_vpc"]])
mean.gmat_tpc.placed <- mean(placed.df[["gmat_tpc"]])
mean.s_avg.placed <- mean(placed.df[["s_avg"]])
mean.f_avg.placed <- mean(placed.df[["f_avg"]])
mean.work_yrs.placed <- mean(placed.df[["work_yrs"]])

5(C)

# Difference in group means, placed minus unplaced, for each compared column.
# The group means were computed earlier in this file from the same data
# frames, so they are reused here rather than recomputed.
mean.age.DIFF <- mean.age.placed - mean.age.unplaced
mean.gmat_tot.DIFF <- mean.gmat_tot.placed - mean.gmat_tot.unplaced
mean.gmat_qpc.DIFF <- mean.gmat_qpc.placed - mean.gmat_qpc.unplaced
mean.gmat_vpc.DIFF <- mean.gmat_vpc.placed - mean.gmat_vpc.unplaced
mean.gmat_tpc.DIFF <- mean.gmat_tpc.placed - mean.gmat_tpc.unplaced
mean.s_avg.DIFF <- mean.s_avg.placed - mean.s_avg.unplaced
mean.f_avg.DIFF <- mean.f_avg.placed - mean.f_avg.unplaced
mean.work_yrs.DIFF <- mean.work_yrs.placed - mean.work_yrs.unplaced
# Print each difference; a negative value means the unplaced mean is larger.
mean.age.DIFF
## [1] -1.554589
mean.gmat_tot.DIFF
## [1] 4.942029
mean.gmat_qpc.DIFF
## [1] 2.190338
mean.gmat_vpc.DIFF
## [1] 0.3594203
mean.gmat_tpc.DIFF
## [1] 2.189372
mean.s_avg.DIFF
## [1] -0.000352657
mean.f_avg.DIFF
## [1] -0.005777778
mean.work_yrs.DIFF
## [1] -0.9222222

EXECUTIVE SUMMARY 1: General Understanding 1. Age: The mean age of unplaced people is higher than that of placed people by about 1.5 years. Hence, in general, younger people did get placed. 2. Sex: Placed: out of all placed, ~74% are male and the rest are female. Unplaced: out of all unplaced, ~74% are male and the rest are female. Hence sex is not an issue: since the overall number of males is high, the numbers of placed and unplaced males are high too. 3. gmat_tot: GMAT total score. The difference in mean GMAT score between placed and unplaced people is 4.94. Hence people who scored more on average did get placed. 4. gmat_qpc: Quantitative GMAT percentile. The difference in mean quantitative GMAT percentile between placed and unplaced people is 2.19. Hence people who have a higher percentile did get placed. 5. gmat_vpc: Verbal GMAT percentile. The difference in mean verbal GMAT percentile between placed and unplaced people is 0.359. Hence people who have a higher verbal percentile did get placed. 6. gmat_tpc: Overall GMAT percentile. The difference in mean overall GMAT percentile between placed and unplaced people is 2.19. Hence people who have a higher percentile did get placed. 7. s_avg: Spring MBA Average. People who are unplaced SURPRISINGLY had a higher Spring average (by about 0.0004). 8. f_avg: Fall MBA Average. People who are unplaced SURPRISINGLY had a higher Fall average (by about 0.006). 9. Working Years: People who are unplaced SURPRISINGLY had more working experience too (by about 0.92 years). 10. Satisfaction: People who were unplaced were more satisfied; in fact, no one rated below 4.

PART B COMPARISON BETWEEN PLACED AND UNPLACED

# Replace the numeric codes in sex (1, 2) and frstlang (1, 2) with text
# labels. The second replacement in each pair runs on the already
# partly-relabelled column, exactly as a pair of indexed assignments would.
newdata.df$sex <- replace(newdata.df$sex, newdata.df$sex == 1, "Male")
newdata.df$sex <- replace(newdata.df$sex, newdata.df$sex == 2, "Female")
newdata.df$frstlang <- replace(
  newdata.df$frstlang, newdata.df$frstlang == 1, "English"
)
newdata.df$frstlang <- replace(
  newdata.df$frstlang, newdata.df$frstlang == 2, "Other"
)
# Flag rows whose salary is above 1000 and store the flag as a factor.
newdata.df$GotPlaced <- factor(newdata.df$salary > 1000)
View(newdata.df)
# Count the rows on each side of the flag.
table8 <- table(newdata.df$GotPlaced)
table8
## 
## FALSE  TRUE 
##   125   103

103 have GotPlaced = TRUE (salary above 1000); the other 125 (salary 0 or 999) do not

# Replace the numeric codes in sex and frstlang of placed.df with text labels.
placed.df$sex <- replace(placed.df$sex, placed.df$sex == 1, "Male")
placed.df$sex <- replace(placed.df$sex, placed.df$sex == 2, "Female")
placed.df$frstlang <- replace(
  placed.df$frstlang, placed.df$frstlang == 1, "English"
)
placed.df$frstlang <- replace(
  placed.df$frstlang, placed.df$frstlang == 2, "Other"
)
# Placed rows counted by sex.
table9 <- table(placed.df$sex)
table9
## 
## Female   Male 
##     36    102
# Placed rows counted by frstlang label.
table10 <- table(placed.df$frstlang)
table10
## 
## English   Other 
##     122      16
# Placed rows cross-tabulated by sex (rows) and frstlang label (columns).
table14 <- table(placed.df$sex, placed.df$frstlang)
table14
##         
##          English Other
##   Female      31     5
##   Male        91    11

102 males and 36 females were placed. Of the placed people, 122 are coded English for frstlang and 16 are coded Other.

# Replace the numeric codes in sex and frstlang of unplaced.df with text
# labels.
unplaced.df$sex <- replace(unplaced.df$sex, unplaced.df$sex == 1, "Male")
unplaced.df$sex <- replace(unplaced.df$sex, unplaced.df$sex == 2, "Female")
unplaced.df$frstlang <- replace(
  unplaced.df$frstlang, unplaced.df$frstlang == 1, "English"
)
unplaced.df$frstlang <- replace(
  unplaced.df$frstlang, unplaced.df$frstlang == 2, "Other"
)
# Unplaced rows counted by sex.
table11 <- table(unplaced.df$sex)
table11
## 
## Female   Male 
##     23     67
# Unplaced rows counted by frstlang label.
table12 <- table(unplaced.df$frstlang)
table12
## 
## English   Other 
##      82       8
# Unplaced rows cross-tabulated by sex (rows) and frstlang label (columns).
table13 <- table(unplaced.df$sex, unplaced.df$frstlang)
table13
##         
##          English Other
##   Female      22     1
##   Male        60     7

67 males and 23 females were unplaced; of the unplaced people, 82 are coded English for frstlang and 8 are coded Other. PART C: VISUALIZATION Visualizing Salary vs Various factors

# Salary against age, sex, frstlang and satis, drawn as a 2 x 2 grid of
# horizontal box plots.
#
# Only rows with salary > 1000 are plotted. In data.df the salary column also
# holds the values 0, 998 and 999, and satis holds 998 on the rows that are
# later dropped for salary == 998. Plotting those rows would mix placeholder
# codes into the salary axis and add a bogus satisfaction group. The cut-off
# salary > 1000 is the same one this file uses to build GotPlaced.
# NOTE(review): 999 is assumed to be a code rather than a real salary (the
# next smallest salary in the data is 64000) -- confirm against the data
# dictionary.
par(mfrow = c(2, 2))
boxplot(salary ~ age,
  data = data.df, subset = salary > 1000,
  main = "Salary vs age", horizontal = TRUE
)
boxplot(salary ~ sex,
  data = data.df, subset = salary > 1000,
  main = "Salary vs sex", horizontal = TRUE
)
boxplot(salary ~ frstlang,
  data = data.df, subset = salary > 1000,
  main = "Salary vs Language", horizontal = TRUE
)
boxplot(salary ~ satis,
  data = data.df, subset = salary > 1000,
  main = "Salary vs Satisfaction", horizontal = TRUE
)

PART D: REGRESSION Dependent Variable: salary Independent Variables: age, sex, gmat_tot, gmat_qpc, gmat_vpc, gmat_tpc, s_avg, f_avg, quarter, work_yrs, frstlang, satis

Model1: Seeing how GMAT factors (gmat_tot, gmat_qpc, gmat_vpc) and work experience (work_yrs) affect salary

# Model 1: linear regression of salary on GMAT total score, GMAT quantitative
# and verbal percentiles, and work_yrs.
#
# placed.df keeps every row whose salary is not 0, so it still contains the
# rows whose salary is 999. Fitting on those rows treats the code 999 as a
# real starting salary. The subset argument restricts the fit to rows with
# salary > 1000, the same cut-off this file uses to build GotPlaced.
# NOTE(review): 999 is assumed to be a code rather than a real salary --
# confirm against the data dictionary.
# NOTE: the summary output recorded after this call in the file was produced
# before this filter was added and has to be regenerated.
fit1 <- lm(
  salary ~ gmat_tot + gmat_qpc + gmat_vpc + work_yrs,
  data = placed.df,
  subset = salary > 1000
)
summary(fit1)
## 
## Call:
## lm(formula = placed.df$salary ~ placed.df$gmat_tot + placed.df$gmat_qpc + 
##     placed.df$gmat_vpc + placed.df$work_yrs)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -96032 -23043  16926  28116 114319 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)   
## (Intercept)        242933.2    81083.6   2.996  0.00326 **
## placed.df$gmat_tot   -575.7      303.5  -1.897  0.05997 . 
## placed.df$gmat_qpc    918.3      793.0   1.158  0.24893   
## placed.df$gmat_vpc   1387.2      721.4   1.923  0.05663 . 
## placed.df$work_yrs   2206.0     1469.0   1.502  0.13553   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 46240 on 133 degrees of freedom
## Multiple R-squared:  0.06599,    Adjusted R-squared:  0.0379 
## F-statistic: 2.349 on 4 and 133 DF,  p-value: 0.05752