# Print the current working directory.
getwd()
## [1] "C:/Users/parvp/Desktop/data analytics internship"
# Load the MBA starting-salaries survey into a data frame.
# The file name is passed directly: wrapping a single string in
# paste(..., sep = "") returns that same string and adds nothing.
salary.df <- read.csv("MBA Starting Salaries Data.csv")
library(psych)
## Warning: package 'psych' was built under R version 3.4.3
# Per-column summary statistics for the raw data.
describe(salary.df)
## vars n mean sd median trimmed mad min max
## age 1 274 27.36 3.71 27 26.76 2.97 22 48
## sex 2 274 1.25 0.43 1 1.19 0.00 1 2
## gmat_tot 3 274 619.45 57.54 620 618.86 59.30 450 790
## gmat_qpc 4 274 80.64 14.87 83 82.31 14.83 28 99
## gmat_vpc 5 274 78.32 16.86 81 80.33 14.83 16 99
## gmat_tpc 6 274 84.20 14.02 87 86.12 11.86 0 99
## s_avg 7 274 3.03 0.38 3 3.03 0.44 2 4
## f_avg 8 274 3.06 0.53 3 3.09 0.37 0 4
## quarter 9 274 2.48 1.11 2 2.47 1.48 1 4
## work_yrs 10 274 3.87 3.23 3 3.29 1.48 0 22
## frstlang 11 274 1.12 0.32 1 1.02 0.00 1 2
## salary 12 274 39025.69 50951.56 999 33607.86 1481.12 0 220000
## satis 13 274 172.18 371.61 6 91.50 1.48 1 998
## range skew kurtosis se
## age 26 2.16 6.45 0.22
## sex 1 1.16 -0.66 0.03
## gmat_tot 340 -0.01 0.06 3.48
## gmat_qpc 71 -0.92 0.30 0.90
## gmat_vpc 83 -1.04 0.74 1.02
## gmat_tpc 99 -2.28 9.02 0.85
## s_avg 2 -0.06 -0.38 0.02
## f_avg 4 -2.08 10.85 0.03
## quarter 3 0.02 -1.35 0.07
## work_yrs 22 2.78 9.80 0.20
## frstlang 1 2.37 3.65 0.02
## salary 220000 0.70 -1.05 3078.10
## satis 997 1.77 1.13 22.45
# Number of rows and columns in the raw data.
dim(salary.df)
## [1] 274 13
# Preview the first rows of the raw data.
head(salary.df)
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter work_yrs
## 1 23 2 620 77 87 87 3.4 3.00 1 2
## 2 24 1 610 90 71 87 3.5 4.00 1 2
## 3 24 1 670 99 78 95 3.3 3.25 1 2
## 4 24 1 570 56 81 75 3.3 2.67 1 1
## 5 24 2 710 93 98 98 3.6 3.75 1 2
## 6 24 1 640 82 89 91 3.9 3.75 1 2
## frstlang salary satis
## 1 1 0 7
## 2 1 0 6
## 3 1 0 6
## 4 1 0 7
## 5 1 999 5
## 6 1 0 6
# Distribution of respondents' ages.
hist(
  salary.df$age,
  breaks = 20,
  col = "khaki",
  xlab = "Age in years",
  main = "Graph showing age distribution"
)

# Recode sex (1/2) as a labelled factor, then plot the count per level.
salary.df$sex <- factor(
  salary.df$sex,
  levels = c(1, 2),
  labels = c("Male", "Female")
)
plot(salary.df$sex, col = "skyblue", main = "Gender distribution")

# Distribution of prior work experience.
hist(
  salary.df$work_yrs,
  breaks = 20,
  col = "green",
  xlab = "Work Experience in years",
  main = "Work experience distribution"
)

# Distribution of total GMAT scores.
hist(
  salary.df$gmat_tot,
  breaks = 40,
  col = "red",
  xlab = "score out of 800",
  main = "Gmat Score distribution"
)

# Recode first language (1/2) as a labelled factor, then plot the counts.
salary.df$frstlang <- factor(
  salary.df$frstlang,
  levels = c(1, 2),
  labels = c("English", "Others")
)
plot(salary.df$frstlang, col = "cyan", main = "Language Distribution")
New structure of the dataset after converting sex and first language to factors
# Inspect the type and first values of every column in salary.df.
str(salary.df)
## 'data.frame': 274 obs. of 13 variables:
## $ age : int 23 24 24 24 24 24 25 25 25 25 ...
## $ sex : Factor w/ 2 levels "Male","Female": 2 1 1 1 2 1 1 2 1 1 ...
## $ gmat_tot: int 620 610 670 570 710 640 610 650 630 680 ...
## $ gmat_qpc: int 77 90 99 56 93 82 89 88 79 99 ...
## $ gmat_vpc: int 87 71 78 81 98 89 74 89 91 81 ...
## $ gmat_tpc: int 87 87 95 75 98 91 87 92 89 96 ...
## $ s_avg : num 3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
## $ f_avg : num 3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
## $ quarter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ work_yrs: int 2 2 2 1 2 2 2 2 2 2 ...
## $ frstlang: Factor w/ 2 levels "English","Others": 1 1 1 1 1 1 1 1 2 1 ...
## $ salary : int 0 0 0 0 999 0 0 0 999 998 ...
## $ satis : int 7 6 6 7 5 6 5 6 4 998 ...
Students who got placed and disclosed their salary
# Keep respondents whose salary is above the 998/999 survey codes,
# i.e. those who reported an actual salary figure.
placed.df <- subset(salary.df, salary > 999)
View(placed.df)
dim(placed.df)
## [1] 103 13
Students who disclosed that they did not get placed (salary = 0)
# Keep respondents whose salary is recorded as 0.
unplaced.df <- subset(salary.df, salary == 0)
View(unplaced.df)
dim(unplaced.df)
## [1] 90 13
Students who did not answer the survey (salary code 998)
# Keep respondents whose salary carries the code 998.
notsurveyed.df <- subset(salary.df, salary == 998)
View(notsurveyed.df)
dim(notsurveyed.df)
## [1] 46 13
Students who answered the survey but did not disclose salary data (salary code 999)
# Keep respondents whose salary carries the code 999.
notdisclosed.df <- subset(salary.df, salary == 999)
View(notdisclosed.df)
dim(notdisclosed.df)
## [1] 35 13
Summing all four subcategories (103 + 90 + 46 + 35)
# Sanity check: add up the row counts of the four subsets.
nrow(notdisclosed.df) + nrow(placed.df) + nrow(unplaced.df) +
  nrow(notsurveyed.df)
## [1] 274
We get a total of 274, which equals the number of rows in the original dataset
Summary of Placed
# Per-column summary statistics for the placed graduates only.
describe(placed.df)
## vars n mean sd median trimmed mad min
## age 1 103 26.78 3.27 2.60e+01 26.30 2.97 22.0
## sex* 2 103 1.30 0.46 1.00e+00 1.25 0.00 1.0
## gmat_tot 3 103 616.02 50.69 6.20e+02 615.90 59.30 500.0
## gmat_qpc 4 103 79.73 13.39 8.20e+01 81.05 13.34 39.0
## gmat_vpc 5 103 78.56 16.14 8.10e+01 80.33 16.31 30.0
## gmat_tpc 6 103 84.52 11.01 8.70e+01 85.60 11.86 51.0
## s_avg 7 103 3.09 0.38 3.10e+00 3.10 0.44 2.2
## f_avg 8 103 3.09 0.49 3.25e+00 3.13 0.37 0.0
## quarter 9 103 2.26 1.12 2.00e+00 2.20 1.48 1.0
## work_yrs 10 103 3.68 3.01 3.00e+00 3.11 1.48 0.0
## frstlang* 11 103 1.07 0.25 1.00e+00 1.00 0.00 1.0
## salary 12 103 103030.74 17868.80 1.00e+05 101065.06 7413.00 64000.0
## satis 13 103 5.88 0.78 6.00e+00 5.89 1.48 3.0
## max range skew kurtosis se
## age 40 18.0 1.92 4.90 0.32
## sex* 2 1.0 0.86 -1.28 0.05
## gmat_tot 720 220.0 0.01 -0.69 4.99
## gmat_qpc 99 60.0 -0.81 0.17 1.32
## gmat_vpc 99 69.0 -0.87 0.21 1.59
## gmat_tpc 99 48.0 -0.84 0.19 1.08
## s_avg 4 1.8 -0.13 -0.61 0.04
## f_avg 4 4.0 -2.52 13.86 0.05
## quarter 4 3.0 0.27 -1.34 0.11
## work_yrs 16 16.0 2.48 6.83 0.30
## frstlang* 2 1.0 3.38 9.54 0.02
## salary 220000 156000.0 3.18 17.16 1760.67
## satis 7 4.0 -0.40 0.44 0.08
Average Salary of a placed Grad is 103030.74
# Horizontal boxplots of salary, age and work experience for placed graduates.
boxplot(
  placed.df$salary,
  horizontal = TRUE,
  col = "darkolivegreen1",
  main = "Salary"
)
boxplot(
  placed.df$age,
  horizontal = TRUE,
  col = "darkolivegreen1",
  main = "age distribution for placed Grad"
)
boxplot(
  placed.df$work_yrs,
  horizontal = TRUE,
  col = "darkolivegreen1",
  main = "Work experience of placed Grad"
)
library(car)
## Warning: package 'car' was built under R version 3.4.3
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
# Scatterplots of salary (x axis) against work experience, age and GMAT total
# for the placed graduates.
scatterplot(
  x = placed.df$salary,
  y = placed.df$work_yrs,
  xlab = "Salary",
  ylab = "Work years",
  main = "comparison of salaries with work experience"
)
scatterplot(
  x = placed.df$salary,
  y = placed.df$age,
  xlab = "Salary",
  ylab = "Age",
  main = "comparison of salary with ages"
)
scatterplot(
  x = placed.df$salary,
  y = placed.df$gmat_tot,
  xlab = "Salary",
  ylab = "Gmat Score",
  main = "comparison of salary with Gmat Score"
)

# Three-way count table of salary by sex by work experience, printed flat.
a <- xtabs(~ salary + sex + work_yrs, data = placed.df)
ftable(a)
## work_yrs 0 1 2 3 4 5 6 7 8 10 15 16
## salary sex
## 64000 Male 0 0 0 0 0 0 0 0 0 0 0 0
## Female 0 0 1 0 0 0 0 0 0 0 0 0
## 77000 Male 0 0 1 0 0 0 0 0 0 0 0 0
## Female 0 0 0 0 0 0 0 0 0 0 0 0
## 78256 Male 0 0 0 0 0 0 0 0 0 0 0 0
## Female 0 1 0 0 0 0 0 0 0 0 0 0
## 82000 Male 0 0 0 0 0 0 0 0 0 0 0 0
## Female 0 1 0 0 0 0 0 0 0 0 0 0
## 85000 Male 0 0 1 0 0 0 0 0 0 0 0 0
## Female 0 1 1 1 0 0 0 0 0 0 0 0
## 86000 Male 0 0 0 0 0 0 0 0 0 0 0 0
## Female 0 0 1 1 0 0 0 0 0 0 0 0
## 88000 Male 0 0 0 0 0 0 0 0 0 0 0 0
## Female 0 0 0 1 0 0 0 0 0 0 0 0
## 88500 Male 0 0 0 1 0 0 0 0 0 0 0 0
## Female 0 0 0 0 0 0 0 0 0 0 0 0
## 90000 Male 0 0 2 0 0 1 0 0 0 0 0 0
## Female 0 0 0 0 0 0 0 0 0 0 0 0
## 92000 Male 0 0 2 0 0 0 0 0 0 0 0 0
## Female 0 0 1 0 0 0 0 0 0 0 0 0
## 93000 Male 0 0 0 0 1 0 0 0 1 0 0 0
## Female 0 0 0 0 0 1 0 0 0 0 0 0
## 95000 Male 1 0 2 1 0 0 0 0 0 0 0 0
## Female 0 1 0 1 0 1 0 0 0 0 0 0
## 96000 Male 0 1 2 0 0 0 0 0 0 0 0 0
## Female 0 0 0 0 1 0 0 0 0 0 0 0
## 96500 Male 0 0 1 0 0 0 0 0 0 0 0 0
## Female 0 0 0 0 0 0 0 0 0 0 0 0
## 97000 Male 0 0 0 1 1 0 0 0 0 0 0 0
## Female 0 0 0 0 0 0 0 0 0 0 0 0
## 98000 Male 0 0 4 0 1 0 0 1 0 0 0 0
## Female 0 0 3 1 0 0 0 0 0 0 0 0
## 99000 Male 0 0 0 0 0 0 0 0 0 0 0 0
## Female 0 0 0 0 0 1 0 0 0 0 0 0
## 100000 Male 0 0 1 1 1 0 1 0 0 0 0 0
## Female 0 0 5 0 0 0 0 0 0 0 0 0
## 100400 Male 0 0 0 1 0 0 0 0 0 0 0 0
## Female 0 0 0 0 0 0 0 0 0 0 0 0
## 101000 Male 0 0 0 0 0 0 0 0 0 0 0 0
## Female 0 0 2 0 0 0 0 0 0 0 0 0
## 101100 Male 0 0 0 0 0 0 0 0 1 0 0 0
## Female 0 0 0 0 0 0 0 0 0 0 0 0
## 101600 Male 0 0 0 1 0 0 0 0 0 0 0 0
## Female 0 0 0 0 0 0 0 0 0 0 0 0
## 102500 Male 0 0 0 0 0 0 1 0 0 0 0 0
## Female 0 0 0 0 0 0 0 0 0 0 0 0
## 103000 Male 0 0 0 1 0 0 0 0 0 0 0 0
## Female 0 0 0 0 0 0 0 0 0 0 0 0
## 104000 Male 0 0 0 0 2 0 0 0 0 0 0 0
## Female 0 0 0 0 0 0 0 0 0 0 0 0
## 105000 Male 0 0 4 4 0 1 1 0 0 0 0 1
## Female 0 0 0 0 0 0 0 0 0 0 0 0
## 106000 Male 0 0 0 0 0 0 1 0 1 0 0 0
## Female 0 0 0 0 0 0 1 0 0 0 0 0
## 107000 Male 0 0 1 0 0 0 0 0 0 0 0 0
## Female 0 0 0 0 0 0 0 0 0 0 0 0
## 107300 Male 0 0 1 0 0 0 0 0 0 0 0 0
## Female 0 0 0 0 0 0 0 0 0 0 0 0
## 107500 Male 0 0 0 1 0 0 0 0 0 0 0 0
## Female 0 0 0 0 0 0 0 0 0 0 0 0
## 108000 Male 0 0 0 1 1 0 0 0 0 0 0 0
## Female 0 0 0 0 0 0 0 0 0 0 0 0
## 110000 Male 0 0 0 0 0 0 0 0 0 0 0 0
## Female 0 0 0 0 0 0 1 0 0 0 0 0
## 112000 Male 0 0 1 0 0 0 1 0 0 0 0 1
## Female 0 0 0 0 0 0 0 0 0 0 0 0
## 115000 Male 0 2 0 1 2 0 0 0 0 0 0 0
## Female 0 0 0 0 0 0 0 0 0 0 0 0
## 118000 Male 0 0 0 0 0 0 0 0 0 1 0 0
## Female 0 0 0 0 0 0 0 0 0 0 0 0
## 120000 Male 0 0 0 1 0 2 0 0 0 0 0 0
## Female 0 0 0 0 0 0 0 0 1 0 0 0
## 126710 Male 0 0 0 1 0 0 0 0 0 0 0 0
## Female 0 0 0 0 0 0 0 0 0 0 0 0
## 130000 Male 0 0 0 0 1 0 0 0 0 0 0 0
## Female 0 0 0 0 0 0 0 0 0 0 0 0
## 145800 Male 0 0 1 0 0 0 0 0 0 0 0 0
## Female 0 0 0 0 0 0 0 0 0 0 0 0
## 146000 Male 0 0 0 0 0 0 0 0 0 0 1 0
## Female 0 0 0 0 0 0 0 0 0 0 0 0
## 162000 Male 0 1 0 0 0 0 0 0 0 0 0 0
## Female 0 0 0 0 0 0 0 0 0 0 0 0
## 220000 Male 0 0 0 0 0 0 0 0 0 0 0 0
## Female 0 0 0 0 0 0 0 0 0 0 1 0
# Mean salary of placed graduates at each age.
aggregate(salary ~ age, data = placed.df, FUN = mean)
## age salary
## 1 22 85000.00
## 2 23 91651.20
## 3 24 101518.75
## 4 25 99086.96
## 5 26 101665.00
## 6 27 102214.29
## 7 28 103625.00
## 8 29 102083.33
## 9 30 109916.67
## 10 31 100500.00
## 11 32 107300.00
## 12 33 118000.00
## 13 34 105000.00
## 14 39 112000.00
## 15 40 183000.00
# Mean salary of placed graduates at each satisfaction level.
aggregate(salary ~ satis, data = placed.df, FUN = mean)
## satis salary
## 1 3 95000.00
## 2 4 95000.00
## 3 5 102974.34
## 4 6 105364.20
## 5 7 98531.82
This shows that no placed graduate reported a satisfaction level of 1 or 2; the lowest level reported is 3.
Let's test some hypotheses
# Cross-tabulate salary against work experience for placed graduates and run
# Pearson's chi-squared test of independence on the resulting table.
# NOTE(review): salary enters as a categorical variable, one row per distinct
# value, so most cells hold 0 or 1 observations. That is presumably why
# chisq.test() warns that the approximation may be incorrect; treat the
# p-value with caution.
mytable1 <- xtabs(~ salary + work_yrs, data = placed.df)
chisq.test(mytable1)
## Warning in chisq.test(mytable1): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable1
## X-squared = 535.23, df = 451, p-value = 0.003809
Since p<0.01, there is a relationship between work experience and salary
# Cross-tabulate salary against first language for placed graduates and run
# Pearson's chi-squared test of independence on the resulting table.
# NOTE(review): salary enters as a categorical variable, so the table is
# sparse and chisq.test() warns that the approximation may be incorrect.
mytable2 <- xtabs(~ salary + frstlang, data = placed.df)
chisq.test(mytable2)
## Warning in chisq.test(mytable2): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable2
## X-squared = 69.847, df = 41, p-value = 0.003296
Since p<0.01 we can say that there is a relationship between first language and salary
# Cross-tabulate salary against total GMAT score for placed graduates and run
# Pearson's chi-squared test of independence on the resulting table.
# NOTE(review): both variables enter as categorical, so the table is sparse
# and chisq.test() warns that the approximation may be incorrect.
mytable3 <- xtabs(~ salary + gmat_tot, data = placed.df)
chisq.test(mytable3)
## Warning in chisq.test(mytable3): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable3
## X-squared = 927.24, df = 820, p-value = 0.005279
Since p<0.01 we see there exists a relationship between Total GMAT score and starting salary.
library(corrgram)
## Warning: package 'corrgram' was built under R version 3.4.3
# Correlogram of the numeric variables for graduates who reported a salary.
# placed.df is used instead of salary.df because the full data set still
# contains the survey codes 0, 998 and 999 in `salary` (and 998 in `satis`).
# Those are not real measurements and would distort the correlations.
corrgram(
  placed.df,
  upper.panel = panel.pie,
  main = "Corrgram of Starting Salaries intercorrelations"
)
Model 1: linear model using age, GMAT performance and work experience to explain the starting salary of MBA graduates
# Regress salary on age, gmat_tpc and work_yrs for the placed graduates,
# then print the coefficient table and fit statistics.
fit <- lm(
  salary ~ age + gmat_tpc + work_yrs,
  data = placed.df
)
summary(fit)
##
## Call:
## lm(formula = salary ~ age + gmat_tpc + work_yrs, data = placed.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33547 -7760 -1788 4647 76796
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 47565.6 25902.6 1.836 0.0693 .
## age 2455.0 999.0 2.458 0.0157 *
## gmat_tpc -133.9 142.0 -0.943 0.3480
## work_yrs 284.6 1090.2 0.261 0.7946
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15630 on 99 degrees of freedom
## Multiple R-squared: 0.2573, Adjusted R-squared: 0.2348
## F-statistic: 11.43 on 3 and 99 DF, p-value: 1.683e-06
Model 2: linear model using age, gender, GMAT performance and work experience to explain the starting salary of MBA graduates
# Regress salary on age, gmat_tpc, work_yrs and sex for the placed graduates,
# then print the coefficient table and fit statistics.
fit <- lm(
  salary ~ age + gmat_tpc + work_yrs + sex,
  data = placed.df
)
summary(fit)
##
## Call:
## lm(formula = salary ~ age + gmat_tpc + work_yrs + sex, data = placed.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31132 -8216 -1918 5863 80378
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 53386.4 26309.1 2.029 0.0451 *
## age 2300.2 1005.4 2.288 0.0243 *
## gmat_tpc -143.1 141.9 -1.008 0.3158
## work_yrs 371.1 1090.4 0.340 0.7343
## sexFemale -4039.5 3400.0 -1.188 0.2377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15600 on 98 degrees of freedom
## Multiple R-squared: 0.2678, Adjusted R-squared: 0.2379
## F-statistic: 8.962 on 4 and 98 DF, p-value: 3.282e-06
Model 3: linear model using age, first language, GMAT performance and work experience to explain the starting salary of MBA graduates
# Regress salary on age, gmat_tpc, work_yrs and frstlang for the placed
# graduates, then print the coefficient table and fit statistics.
fit <- lm(
  salary ~ age + gmat_tpc + work_yrs + frstlang,
  data = placed.df
)
summary(fit)
##
## Call:
## lm(formula = salary ~ age + gmat_tpc + work_yrs + frstlang, data = placed.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33397 -8375 -1829 4846 72994
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 56208.6 26992.5 2.082 0.0399 *
## age 1977.7 1084.6 1.823 0.0713 .
## gmat_tpc -106.1 144.0 -0.737 0.4630
## work_yrs 628.2 1131.1 0.555 0.5799
## frstlangOthers 7677.9 6846.1 1.122 0.2648
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15610 on 98 degrees of freedom
## Multiple R-squared: 0.2667, Adjusted R-squared: 0.2368
## F-statistic: 8.91 on 4 and 98 DF, p-value: 3.527e-06
Model 4: linear model using job satisfaction, work experience and first language to explain the starting salary of MBA graduates
# Regress salary on satis, work_yrs and frstlang for the placed graduates,
# then print the coefficient table and fit statistics.
fit <- lm(
  salary ~ satis + work_yrs + frstlang,
  data = placed.df
)
summary(fit)
##
## Call:
## lm(formula = salary ~ satis + work_yrs + frstlang, data = placed.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31764 -9640 -604 4816 76193
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 104142.2 11899.4 8.752 5.73e-14 ***
## satis -1913.1 2000.0 -0.957 0.3411
## work_yrs 2506.8 528.6 4.742 7.11e-06 ***
## frstlangOthers 13541.5 6305.7 2.147 0.0342 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15740 on 99 degrees of freedom
## Multiple R-squared: 0.2466, Adjusted R-squared: 0.2237
## F-statistic: 10.8 on 3 and 99 DF, p-value: 3.354e-06
work_yrs and frstlang are significant variables in this model. The multiple R-squared value indicates that the model accounts for 24.66% of the variance in salary. The residual standard error (15740) can be thought of as the typical error in predicting salary from work experience, job satisfaction and first language.
# Pearson's chi-squared test of independence between work experience and
# satisfaction among the unplaced students.
chisq.test(unplaced.df$work_yrs,unplaced.df$satis)
## Warning in chisq.test(unplaced.df$work_yrs, unplaced.df$satis): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: unplaced.df$work_yrs and unplaced.df$satis
## X-squared = 44.974, df = 48, p-value = 0.5976
Since p = 0.5976, we cannot reject independence: among unplaced students there is no evidence that satisfaction with the MBA program is related to work experience.