# Show the working directory; the CSV below is read relative to it.
getwd()
## [1] "C:/Users/parvp/Desktop/data analytics internship"
# Load the MBA starting-salary data. The file name is passed directly:
# wrapping a single string in paste(..., sep = "") did nothing.
salary.df <- read.csv("MBA Starting Salaries Data.csv")
library(psych)
## Warning: package 'psych' was built under R version 3.4.3
# Per-column descriptive statistics for the full data set.
describe(salary.df)
##          vars   n     mean       sd median  trimmed     mad min    max
## age         1 274    27.36     3.71     27    26.76    2.97  22     48
## sex         2 274     1.25     0.43      1     1.19    0.00   1      2
## gmat_tot    3 274   619.45    57.54    620   618.86   59.30 450    790
## gmat_qpc    4 274    80.64    14.87     83    82.31   14.83  28     99
## gmat_vpc    5 274    78.32    16.86     81    80.33   14.83  16     99
## gmat_tpc    6 274    84.20    14.02     87    86.12   11.86   0     99
## s_avg       7 274     3.03     0.38      3     3.03    0.44   2      4
## f_avg       8 274     3.06     0.53      3     3.09    0.37   0      4
## quarter     9 274     2.48     1.11      2     2.47    1.48   1      4
## work_yrs   10 274     3.87     3.23      3     3.29    1.48   0     22
## frstlang   11 274     1.12     0.32      1     1.02    0.00   1      2
## salary     12 274 39025.69 50951.56    999 33607.86 1481.12   0 220000
## satis      13 274   172.18   371.61      6    91.50    1.48   1    998
##           range  skew kurtosis      se
## age          26  2.16     6.45    0.22
## sex           1  1.16    -0.66    0.03
## gmat_tot    340 -0.01     0.06    3.48
## gmat_qpc     71 -0.92     0.30    0.90
## gmat_vpc     83 -1.04     0.74    1.02
## gmat_tpc     99 -2.28     9.02    0.85
## s_avg         2 -0.06    -0.38    0.02
## f_avg         4 -2.08    10.85    0.03
## quarter       3  0.02    -1.35    0.07
## work_yrs     22  2.78     9.80    0.20
## frstlang      1  2.37     3.65    0.02
## salary   220000  0.70    -1.05 3078.10
## satis       997  1.77     1.13   22.45
# Size of the raw data: number of rows, then number of columns.
c(nrow(salary.df), ncol(salary.df))
## [1] 274  13
# Preview the first six records.
head(salary.df, n = 6L)
##   age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter work_yrs
## 1  23   2      620       77       87       87   3.4  3.00       1        2
## 2  24   1      610       90       71       87   3.5  4.00       1        2
## 3  24   1      670       99       78       95   3.3  3.25       1        2
## 4  24   1      570       56       81       75   3.3  2.67       1        1
## 5  24   2      710       93       98       98   3.6  3.75       1        2
## 6  24   1      640       82       89       91   3.9  3.75       1        2
##   frstlang salary satis
## 1        1      0     7
## 2        1      0     6
## 3        1      0     6
## 4        1      0     7
## 5        1    999     5
## 6        1      0     6

Plots

# Histogram of respondent ages.
hist(
  salary.df$age,
  breaks = 20,
  col = "khaki",
  xlab = "Age in years",
  main = "Graph showing age distribution"
)

# Recode sex (1 -> "Male", 2 -> "Female") as a factor and plot the counts.
salary.df$sex <- factor(
  salary.df$sex,
  levels = c(1, 2),
  labels = c("Male", "Female")
)
plot(salary.df$sex, col = "skyblue", main = "Gender distribution")

# Histogram of years of work experience.
hist(
  salary.df$work_yrs,
  breaks = 20,
  col = "green",
  xlab = "Work Experience in years",
  main = "Work experience distribution"
)

# Histogram of total GMAT scores.
hist(
  salary.df$gmat_tot,
  breaks = 40,
  col = "red",
  xlab = "score out of 800",
  main = "Gmat Score distribution"
)

# Recode first language (1 -> "English", 2 -> "Others") as a factor and plot it.
salary.df$frstlang <- factor(
  salary.df$frstlang,
  levels = c(1, 2),
  labels = c("English", "Others")
)
plot(salary.df$frstlang, col = "cyan", main = "Language Distribution")

New structure of the dataset after converting sex and first language to factors

# Inspect the column types of the data frame.
utils::str(salary.df)
## 'data.frame':    274 obs. of  13 variables:
##  $ age     : int  23 24 24 24 24 24 25 25 25 25 ...
##  $ sex     : Factor w/ 2 levels "Male","Female": 2 1 1 1 2 1 1 2 1 1 ...
##  $ gmat_tot: int  620 610 670 570 710 640 610 650 630 680 ...
##  $ gmat_qpc: int  77 90 99 56 93 82 89 88 79 99 ...
##  $ gmat_vpc: int  87 71 78 81 98 89 74 89 91 81 ...
##  $ gmat_tpc: int  87 87 95 75 98 91 87 92 89 96 ...
##  $ s_avg   : num  3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
##  $ f_avg   : num  3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
##  $ quarter : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ work_yrs: int  2 2 2 1 2 2 2 2 2 2 ...
##  $ frstlang: Factor w/ 2 levels "English","Others": 1 1 1 1 1 1 1 1 2 1 ...
##  $ salary  : int  0 0 0 0 999 0 0 0 999 998 ...
##  $ satis   : int  7 6 6 7 5 6 5 6 4 998 ...

Dividing the dataset into four groups: placed, not placed, salary not disclosed (999), and survey not answered (998)

who got placed and disclosed their salary

# Keep rows whose salary is above 999, i.e. above the 998/999 codes used for
# non-response and non-disclosure.
placed.df <- salary.df[which(salary.df[["salary"]] > 999), , drop = FALSE]
View(placed.df)
c(nrow(placed.df), ncol(placed.df))
## [1] 103  13

Who did not get placed (salary recorded as 0)

# Keep rows whose salary is exactly 0.
unplaced.df <- salary.df[which(salary.df[["salary"]] == 0), , drop = FALSE]
View(unplaced.df)
c(nrow(unplaced.df), ncol(unplaced.df))
## [1] 90 13

Who did not answer the survey(998)

# Keep rows whose salary holds the code 998.
notsurveyed.df <- salary.df[which(salary.df[["salary"]] == 998), , drop = FALSE]
View(notsurveyed.df)
c(nrow(notsurveyed.df), ncol(notsurveyed.df))
## [1] 46 13

Who answered the survey but did not disclose salary data

# Keep rows whose salary holds the code 999.
notdisclosed.df <- salary.df[which(salary.df[["salary"]] == 999), , drop = FALSE]
View(notdisclosed.df)
c(nrow(notdisclosed.df), ncol(notdisclosed.df))
## [1] 35 13

Summing all four subcategories (103 + 90 + 46 + 35)

# Total number of rows across the four subsets.
sum(
  nrow(notdisclosed.df),
  nrow(placed.df),
  nrow(unplaced.df),
  nrow(notsurveyed.df)
)
## [1] 274

We get a total of 274, which equals the number of rows in the original dataset.

Summary of Placed

# Descriptive statistics for the placed graduates only.
describe(x = placed.df)
##           vars   n      mean       sd   median   trimmed     mad     min
## age          1 103     26.78     3.27 2.60e+01     26.30    2.97    22.0
## sex*         2 103      1.30     0.46 1.00e+00      1.25    0.00     1.0
## gmat_tot     3 103    616.02    50.69 6.20e+02    615.90   59.30   500.0
## gmat_qpc     4 103     79.73    13.39 8.20e+01     81.05   13.34    39.0
## gmat_vpc     5 103     78.56    16.14 8.10e+01     80.33   16.31    30.0
## gmat_tpc     6 103     84.52    11.01 8.70e+01     85.60   11.86    51.0
## s_avg        7 103      3.09     0.38 3.10e+00      3.10    0.44     2.2
## f_avg        8 103      3.09     0.49 3.25e+00      3.13    0.37     0.0
## quarter      9 103      2.26     1.12 2.00e+00      2.20    1.48     1.0
## work_yrs    10 103      3.68     3.01 3.00e+00      3.11    1.48     0.0
## frstlang*   11 103      1.07     0.25 1.00e+00      1.00    0.00     1.0
## salary      12 103 103030.74 17868.80 1.00e+05 101065.06 7413.00 64000.0
## satis       13 103      5.88     0.78 6.00e+00      5.89    1.48     3.0
##              max    range  skew kurtosis      se
## age           40     18.0  1.92     4.90    0.32
## sex*           2      1.0  0.86    -1.28    0.05
## gmat_tot     720    220.0  0.01    -0.69    4.99
## gmat_qpc      99     60.0 -0.81     0.17    1.32
## gmat_vpc      99     69.0 -0.87     0.21    1.59
## gmat_tpc      99     48.0 -0.84     0.19    1.08
## s_avg          4      1.8 -0.13    -0.61    0.04
## f_avg          4      4.0 -2.52    13.86    0.05
## quarter        4      3.0  0.27    -1.34    0.11
## work_yrs      16     16.0  2.48     6.83    0.30
## frstlang*      2      1.0  3.38     9.54    0.02
## salary    220000 156000.0  3.18    17.16 1760.67
## satis          7      4.0 -0.40     0.44    0.08

Average Salary of a placed Grad is 103030.74

# Horizontal boxplot of salaries among placed graduates.
boxplot(
  placed.df$salary,
  main = "Salary",
  col = "darkolivegreen1",
  horizontal = TRUE
)

# Horizontal boxplot of ages among placed graduates.
boxplot(
  placed.df$age,
  main = "age distribution for placed Grad",
  col = "darkolivegreen1",
  horizontal = TRUE
)

# Horizontal boxplot of work experience among placed graduates.
boxplot(
  placed.df$work_yrs,
  main = "Work experience of placed Grad",
  col = "darkolivegreen1",
  horizontal = TRUE
)

Comparison of Salary with Work Experience

library(car)
## Warning: package 'car' was built under R version 3.4.3
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
# Scatterplot with salary on the x-axis and years of work experience on the y-axis.
scatterplot(
  x = placed.df$salary,
  y = placed.df$work_yrs,
  xlab = "Salary",
  ylab = "Work years",
  main = "comparison of salaries with work experience"
)

Comparison of Salary with age

# Scatterplot with salary on the x-axis and age on the y-axis.
scatterplot(
  x = placed.df$salary,
  y = placed.df$age,
  xlab = "Salary",
  ylab = "Age",
  main = "comparison of salary with ages"
)

Comparison of Salary with Gmat Score

# Scatterplot with salary on the x-axis and total GMAT score on the y-axis.
scatterplot(
  x = placed.df$salary,
  y = placed.df$gmat_tot,
  xlab = "Salary",
  ylab = "Gmat Score",
  main = "comparison of salary with Gmat Score"
)

Comparing different factors that affect starting salaries

# Three-way contingency table of salary, sex and work experience for placed
# graduates, printed as a flat table.
a <- xtabs(formula = ~ salary + sex + work_yrs, data = placed.df)
ftable(a)
##               work_yrs 0 1 2 3 4 5 6 7 8 10 15 16
## salary sex                                       
## 64000  Male            0 0 0 0 0 0 0 0 0  0  0  0
##        Female          0 0 1 0 0 0 0 0 0  0  0  0
## 77000  Male            0 0 1 0 0 0 0 0 0  0  0  0
##        Female          0 0 0 0 0 0 0 0 0  0  0  0
## 78256  Male            0 0 0 0 0 0 0 0 0  0  0  0
##        Female          0 1 0 0 0 0 0 0 0  0  0  0
## 82000  Male            0 0 0 0 0 0 0 0 0  0  0  0
##        Female          0 1 0 0 0 0 0 0 0  0  0  0
## 85000  Male            0 0 1 0 0 0 0 0 0  0  0  0
##        Female          0 1 1 1 0 0 0 0 0  0  0  0
## 86000  Male            0 0 0 0 0 0 0 0 0  0  0  0
##        Female          0 0 1 1 0 0 0 0 0  0  0  0
## 88000  Male            0 0 0 0 0 0 0 0 0  0  0  0
##        Female          0 0 0 1 0 0 0 0 0  0  0  0
## 88500  Male            0 0 0 1 0 0 0 0 0  0  0  0
##        Female          0 0 0 0 0 0 0 0 0  0  0  0
## 90000  Male            0 0 2 0 0 1 0 0 0  0  0  0
##        Female          0 0 0 0 0 0 0 0 0  0  0  0
## 92000  Male            0 0 2 0 0 0 0 0 0  0  0  0
##        Female          0 0 1 0 0 0 0 0 0  0  0  0
## 93000  Male            0 0 0 0 1 0 0 0 1  0  0  0
##        Female          0 0 0 0 0 1 0 0 0  0  0  0
## 95000  Male            1 0 2 1 0 0 0 0 0  0  0  0
##        Female          0 1 0 1 0 1 0 0 0  0  0  0
## 96000  Male            0 1 2 0 0 0 0 0 0  0  0  0
##        Female          0 0 0 0 1 0 0 0 0  0  0  0
## 96500  Male            0 0 1 0 0 0 0 0 0  0  0  0
##        Female          0 0 0 0 0 0 0 0 0  0  0  0
## 97000  Male            0 0 0 1 1 0 0 0 0  0  0  0
##        Female          0 0 0 0 0 0 0 0 0  0  0  0
## 98000  Male            0 0 4 0 1 0 0 1 0  0  0  0
##        Female          0 0 3 1 0 0 0 0 0  0  0  0
## 99000  Male            0 0 0 0 0 0 0 0 0  0  0  0
##        Female          0 0 0 0 0 1 0 0 0  0  0  0
## 100000 Male            0 0 1 1 1 0 1 0 0  0  0  0
##        Female          0 0 5 0 0 0 0 0 0  0  0  0
## 100400 Male            0 0 0 1 0 0 0 0 0  0  0  0
##        Female          0 0 0 0 0 0 0 0 0  0  0  0
## 101000 Male            0 0 0 0 0 0 0 0 0  0  0  0
##        Female          0 0 2 0 0 0 0 0 0  0  0  0
## 101100 Male            0 0 0 0 0 0 0 0 1  0  0  0
##        Female          0 0 0 0 0 0 0 0 0  0  0  0
## 101600 Male            0 0 0 1 0 0 0 0 0  0  0  0
##        Female          0 0 0 0 0 0 0 0 0  0  0  0
## 102500 Male            0 0 0 0 0 0 1 0 0  0  0  0
##        Female          0 0 0 0 0 0 0 0 0  0  0  0
## 103000 Male            0 0 0 1 0 0 0 0 0  0  0  0
##        Female          0 0 0 0 0 0 0 0 0  0  0  0
## 104000 Male            0 0 0 0 2 0 0 0 0  0  0  0
##        Female          0 0 0 0 0 0 0 0 0  0  0  0
## 105000 Male            0 0 4 4 0 1 1 0 0  0  0  1
##        Female          0 0 0 0 0 0 0 0 0  0  0  0
## 106000 Male            0 0 0 0 0 0 1 0 1  0  0  0
##        Female          0 0 0 0 0 0 1 0 0  0  0  0
## 107000 Male            0 0 1 0 0 0 0 0 0  0  0  0
##        Female          0 0 0 0 0 0 0 0 0  0  0  0
## 107300 Male            0 0 1 0 0 0 0 0 0  0  0  0
##        Female          0 0 0 0 0 0 0 0 0  0  0  0
## 107500 Male            0 0 0 1 0 0 0 0 0  0  0  0
##        Female          0 0 0 0 0 0 0 0 0  0  0  0
## 108000 Male            0 0 0 1 1 0 0 0 0  0  0  0
##        Female          0 0 0 0 0 0 0 0 0  0  0  0
## 110000 Male            0 0 0 0 0 0 0 0 0  0  0  0
##        Female          0 0 0 0 0 0 1 0 0  0  0  0
## 112000 Male            0 0 1 0 0 0 1 0 0  0  0  1
##        Female          0 0 0 0 0 0 0 0 0  0  0  0
## 115000 Male            0 2 0 1 2 0 0 0 0  0  0  0
##        Female          0 0 0 0 0 0 0 0 0  0  0  0
## 118000 Male            0 0 0 0 0 0 0 0 0  1  0  0
##        Female          0 0 0 0 0 0 0 0 0  0  0  0
## 120000 Male            0 0 0 1 0 2 0 0 0  0  0  0
##        Female          0 0 0 0 0 0 0 0 1  0  0  0
## 126710 Male            0 0 0 1 0 0 0 0 0  0  0  0
##        Female          0 0 0 0 0 0 0 0 0  0  0  0
## 130000 Male            0 0 0 0 1 0 0 0 0  0  0  0
##        Female          0 0 0 0 0 0 0 0 0  0  0  0
## 145800 Male            0 0 1 0 0 0 0 0 0  0  0  0
##        Female          0 0 0 0 0 0 0 0 0  0  0  0
## 146000 Male            0 0 0 0 0 0 0 0 0  0  1  0
##        Female          0 0 0 0 0 0 0 0 0  0  0  0
## 162000 Male            0 1 0 0 0 0 0 0 0  0  0  0
##        Female          0 0 0 0 0 0 0 0 0  0  0  0
## 220000 Male            0 0 0 0 0 0 0 0 0  0  0  0
##        Female          0 0 0 0 0 0 0 0 0  0  1  0
# Mean salary of placed graduates at each age.
aggregate(salary ~ age, data = placed.df, FUN = mean)
##    age    salary
## 1   22  85000.00
## 2   23  91651.20
## 3   24 101518.75
## 4   25  99086.96
## 5   26 101665.00
## 6   27 102214.29
## 7   28 103625.00
## 8   29 102083.33
## 9   30 109916.67
## 10  31 100500.00
## 11  32 107300.00
## 12  33 118000.00
## 13  34 105000.00
## 14  39 112000.00
## 15  40 183000.00
# Mean salary of placed graduates at each satisfaction level.
aggregate(salary ~ satis, data = placed.df, FUN = mean)
##   satis    salary
## 1     3  95000.00
## 2     4  95000.00
## 3     5 102974.34
## 4     6 105364.20
## 5     7  98531.82

This shows that no placed graduate reported a satisfaction level of 1 or 2.

Let's test some hypotheses

# Cross-tabulate salary against work experience and run Pearson's chi-squared
# test on the resulting table.
mytable1 <- xtabs(formula = ~ salary + work_yrs, data = placed.df)
chisq.test(x = mytable1)
## Warning in chisq.test(mytable1): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mytable1
## X-squared = 535.23, df = 451, p-value = 0.003809

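Since p<0.01, the test suggests a relationship between work experience and salary. Note, however, the warning above: the table is very sparse, so the chi-squared approximation may be unreliable.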

# Cross-tabulate salary against first language and run Pearson's chi-squared
# test on the resulting table.
mytable2 <- xtabs(formula = ~ salary + frstlang, data = placed.df)
chisq.test(x = mytable2)
## Warning in chisq.test(mytable2): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mytable2
## X-squared = 69.847, df = 41, p-value = 0.003296

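Since p<0.01, the test suggests a relationship between first language and salary, although the warning above indicates that the chi-squared approximation may be unreliable for this sparse table.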

# Cross-tabulate salary against total GMAT score and run Pearson's chi-squared
# test on the resulting table.
mytable3 <- xtabs(formula = ~ salary + gmat_tot, data = placed.df)
chisq.test(x = mytable3)
## Warning in chisq.test(mytable3): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mytable3
## X-squared = 927.24, df = 820, p-value = 0.005279

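Since p<0.01, the test suggests a relationship between total GMAT score and starting salary, although the warning above indicates that the chi-squared approximation may be unreliable for this sparse table.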

library(corrgram)
## Warning: package 'corrgram' was built under R version 3.4.3
# Correlogram for the placed graduates only. In the full data set the salary
# column contains 0/998/999 and the satis column contains 998 as status codes
# rather than measurements, which would distort every correlation involving
# those columns. placed.df (salary > 999) is the data used by the models below.
corrgram(
  placed.df,
  upper.panel = panel.pie,
  main = "Corrgram of Starting Salaries intercorrelations"
)

linear regression models

model1: linear model taking age, GMAT performance and work experience into consideration in deciding the starting salary of MBA graduates

# Model 1: regress salary on age, gmat_tpc and work_yrs for placed graduates.
fit <- lm(
  formula = salary ~ age + gmat_tpc + work_yrs,
  data = placed.df
)
summary(fit)
## 
## Call:
## lm(formula = salary ~ age + gmat_tpc + work_yrs, data = placed.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -33547  -7760  -1788   4647  76796 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  47565.6    25902.6   1.836   0.0693 .
## age           2455.0      999.0   2.458   0.0157 *
## gmat_tpc      -133.9      142.0  -0.943   0.3480  
## work_yrs       284.6     1090.2   0.261   0.7946  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15630 on 99 degrees of freedom
## Multiple R-squared:  0.2573, Adjusted R-squared:  0.2348 
## F-statistic: 11.43 on 3 and 99 DF,  p-value: 1.683e-06

model2: linear model taking age, gender, GMAT performance and work experience into consideration in deciding the starting salary of MBA graduates

# Model 2: as model 1, with sex added as a predictor.
fit <- lm(
  formula = salary ~ age + gmat_tpc + work_yrs + sex,
  data = placed.df
)
summary(fit)
## 
## Call:
## lm(formula = salary ~ age + gmat_tpc + work_yrs + sex, data = placed.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -31132  -8216  -1918   5863  80378 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  53386.4    26309.1   2.029   0.0451 *
## age           2300.2     1005.4   2.288   0.0243 *
## gmat_tpc      -143.1      141.9  -1.008   0.3158  
## work_yrs       371.1     1090.4   0.340   0.7343  
## sexFemale    -4039.5     3400.0  -1.188   0.2377  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15600 on 98 degrees of freedom
## Multiple R-squared:  0.2678, Adjusted R-squared:  0.2379 
## F-statistic: 8.962 on 4 and 98 DF,  p-value: 3.282e-06

model3: linear model taking age, first language, GMAT performance and work experience into consideration in deciding the starting salary of MBA graduates

# Model 3: regress salary on age, gmat_tpc, work_yrs and first language.
fit <- lm(
  formula = salary ~ age + gmat_tpc + work_yrs + frstlang,
  data = placed.df
)
summary(fit)
## 
## Call:
## lm(formula = salary ~ age + gmat_tpc + work_yrs + frstlang, data = placed.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -33397  -8375  -1829   4846  72994 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)  
## (Intercept)     56208.6    26992.5   2.082   0.0399 *
## age              1977.7     1084.6   1.823   0.0713 .
## gmat_tpc         -106.1      144.0  -0.737   0.4630  
## work_yrs          628.2     1131.1   0.555   0.5799  
## frstlangOthers   7677.9     6846.1   1.122   0.2648  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15610 on 98 degrees of freedom
## Multiple R-squared:  0.2667, Adjusted R-squared:  0.2368 
## F-statistic:  8.91 on 4 and 98 DF,  p-value: 3.527e-06

model4: linear model taking job satisfaction, work experience and first language into consideration in deciding the starting salary of MBA graduates

# Model 4: regress salary on satisfaction, work_yrs and first language.
fit <- lm(
  formula = salary ~ satis + work_yrs + frstlang,
  data = placed.df
)
summary(fit)
## 
## Call:
## lm(formula = salary ~ satis + work_yrs + frstlang, data = placed.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -31764  -9640   -604   4816  76193 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    104142.2    11899.4   8.752 5.73e-14 ***
## satis           -1913.1     2000.0  -0.957   0.3411    
## work_yrs         2506.8      528.6   4.742 7.11e-06 ***
## frstlangOthers  13541.5     6305.7   2.147   0.0342 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15740 on 99 degrees of freedom
## Multiple R-squared:  0.2466, Adjusted R-squared:  0.2237 
## F-statistic:  10.8 on 3 and 99 DF,  p-value: 3.354e-06

work_yrs and frstlang are significant variables in this model. The multiple R-squared value indicates that the model accounts for 24.66% of the variance in salary. The residual standard error (15740) can be thought of as the typical error in predicting salary using work experience, job satisfaction and first language.

 # Pearson's chi-squared test of work experience against satisfaction among
 # the unplaced students.
 chisq.test(
   x = unplaced.df$work_yrs,
   y = unplaced.df$satis
 )
## Warning in chisq.test(unplaced.df$work_yrs, unplaced.df$satis): Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  unplaced.df$work_yrs and unplaced.df$satis
## X-squared = 44.974, df = 48, p-value = 0.5976

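Since p = 0.5976 is well above 0.05, we cannot reject independence: among unplaced students there is no evidence of a relationship between work experience and satisfaction with the MBA program.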