Reading the data
mba <- read.csv(paste("MBA Starting Salaries Data.csv", sep=" "))
summary
library(psych)
summary(mba)
## age sex gmat_tot gmat_qpc
## Min. :22.00 Min. :1.000 Min. :450.0 Min. :28.00
## 1st Qu.:25.00 1st Qu.:1.000 1st Qu.:580.0 1st Qu.:72.00
## Median :27.00 Median :1.000 Median :620.0 Median :83.00
## Mean :27.36 Mean :1.248 Mean :619.5 Mean :80.64
## 3rd Qu.:29.00 3rd Qu.:1.000 3rd Qu.:660.0 3rd Qu.:93.00
## Max. :48.00 Max. :2.000 Max. :790.0 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :16.00 Min. : 0.0 Min. :2.000 Min. :0.000
## 1st Qu.:71.00 1st Qu.:78.0 1st Qu.:2.708 1st Qu.:2.750
## Median :81.00 Median :87.0 Median :3.000 Median :3.000
## Mean :78.32 Mean :84.2 Mean :3.025 Mean :3.062
## 3rd Qu.:91.00 3rd Qu.:94.0 3rd Qu.:3.300 3rd Qu.:3.250
## Max. :99.00 Max. :99.0 Max. :4.000 Max. :4.000
## quarter work_yrs frstlang salary
## Min. :1.000 Min. : 0.000 Min. :1.000 Min. : 0
## 1st Qu.:1.250 1st Qu.: 2.000 1st Qu.:1.000 1st Qu.: 0
## Median :2.000 Median : 3.000 Median :1.000 Median : 999
## Mean :2.478 Mean : 3.872 Mean :1.117 Mean : 39026
## 3rd Qu.:3.000 3rd Qu.: 4.000 3rd Qu.:1.000 3rd Qu.: 97000
## Max. :4.000 Max. :22.000 Max. :2.000 Max. :220000
## satis
## Min. : 1.0
## 1st Qu.: 5.0
## Median : 6.0
## Mean :172.2
## 3rd Qu.: 7.0
## Max. :998.0
str(mba)
## 'data.frame': 274 obs. of 13 variables:
## $ age : int 23 24 24 24 24 24 25 25 25 25 ...
## $ sex : int 2 1 1 1 2 1 1 2 1 1 ...
## $ gmat_tot: int 620 610 670 570 710 640 610 650 630 680 ...
## $ gmat_qpc: int 77 90 99 56 93 82 89 88 79 99 ...
## $ gmat_vpc: int 87 71 78 81 98 89 74 89 91 81 ...
## $ gmat_tpc: int 87 87 95 75 98 91 87 92 89 96 ...
## $ s_avg : num 3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
## $ f_avg : num 3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
## $ quarter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ work_yrs: int 2 2 2 1 2 2 2 2 2 2 ...
## $ frstlang: int 1 1 1 1 1 1 1 1 2 1 ...
## $ salary : int 0 0 0 0 999 0 0 0 999 998 ...
## $ satis : int 7 6 6 7 5 6 5 6 4 998 ...
describe(mba)
## vars n mean sd median trimmed mad min max
## age 1 274 27.36 3.71 27 26.76 2.97 22 48
## sex 2 274 1.25 0.43 1 1.19 0.00 1 2
## gmat_tot 3 274 619.45 57.54 620 618.86 59.30 450 790
## gmat_qpc 4 274 80.64 14.87 83 82.31 14.83 28 99
## gmat_vpc 5 274 78.32 16.86 81 80.33 14.83 16 99
## gmat_tpc 6 274 84.20 14.02 87 86.12 11.86 0 99
## s_avg 7 274 3.03 0.38 3 3.03 0.44 2 4
## f_avg 8 274 3.06 0.53 3 3.09 0.37 0 4
## quarter 9 274 2.48 1.11 2 2.47 1.48 1 4
## work_yrs 10 274 3.87 3.23 3 3.29 1.48 0 22
## frstlang 11 274 1.12 0.32 1 1.02 0.00 1 2
## salary 12 274 39025.69 50951.56 999 33607.86 1481.12 0 220000
## satis 13 274 172.18 371.61 6 91.50 1.48 1 998
## range skew kurtosis se
## age 26 2.16 6.45 0.22
## sex 1 1.16 -0.66 0.03
## gmat_tot 340 -0.01 0.06 3.48
## gmat_qpc 71 -0.92 0.30 0.90
## gmat_vpc 83 -1.04 0.74 1.02
## gmat_tpc 99 -2.28 9.02 0.85
## s_avg 2 -0.06 -0.38 0.02
## f_avg 4 -2.08 10.85 0.03
## quarter 3 0.02 -1.35 0.07
## work_yrs 22 2.78 9.80 0.20
## frstlang 1 2.37 3.65 0.02
## salary 220000 0.70 -1.05 3078.10
## satis 997 1.77 1.13 22.45
boxplot(mba$gmat_tot, main="GMAT score distribution",horizontal=TRUE,col="bisque")
boxplot(mba$gmat_qpc, main="QPC distribution",horizontal=TRUE,col="grey")
boxplot(mba$gmat_vpc, main="VPC distribution",horizontal=TRUE,col="peachpuff")
boxplot(mba$gmat_tpc, main="TPC distribution",horizontal=TRUE,col="violet")
#Vizualizing Spring MBA average and Fall MBA average
par(mfrow=c(1,2))
boxplot(mba$s_avg, main="spring average",col="gold")
boxplot(mba$f_avg, main="fall average",col="darkgreen")
#Vizualizing age distribution
barplot(table(mba$age),col="darkblue")
#Vizualizing experience distribution
barplot(table(mba$work_yrs),main="Number of years of experience",col="lightblue")
#Vizualizing first language distribution
barplot(table(mba$frstlang),main="First Language",xlab="1->English 2-> Others",col="brown")
#Vizualizing salary distribution
hist(mba$salary, main="Salary distribution",xlim=c(50000,220000),ylim=c(0,50),xlab="Salary",ylab="count",col="beige")
#Vizualizing Degree of Satisfaction distribution
#par(mfrow=c(1,2))
hist(mba$satis, main="Degree of satisfaction with MBA program", xlab=" (1= low, 7 = high satisfaction)", xlim = c(1,7),breaks = (1:ceiling(max(mba$satis)/1)*1),col="maroon")
barplot(table(mba$satis),col="magenta")
Scatterplots
#Vizualizing GMAT total vs Salary, Quartile ranking vs Salary, Work experience vs Salary
#par(mfrow=c(1,3))
with(mba, plot(gmat_tot,salary ,cex=1))
with(mba, plot(quarter, salary,cex=1))
with(mba, plot(work_yrs, salary,cex=1))
#We can observe that the person with highest salary is not the one with highest GMAT score instead with below average GMAT score.
#Vizualizing Salary vs Satisfaction
plot(mba$salary,mba$satis,cex=1,ylim=c(1,7),ylab="satisfaction")
#Vizualizing Salary vs Sex
with(mba,plot(salary,sex))
#Vizualizing GMAT total vs Sex, GMAT total vs Spring average, GMAT total vs Fall average
par(mfrow=c(1,3))
with(mba, plot(gmat_tot,sex,cex=1))
with(mba, plot(gmat_tot, s_avg,cex=1))
with(mba, plot(gmat_tot, f_avg,cex=1))
ScatterPlotMatrix
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
scatterplotMatrix(formula=~gmat_tot+gmat_qpc + gmat_vpc+gmat_tpc,data=mba,diagonal="histogram")
scatterplotMatrix(formula=~s_avg + f_avg+work_yrs+salary,data=mba)
correlation between variables
library(corrgram)
library(corrplot)
## corrplot 0.84 loaded
x<-cov(mba[, c(1:10)])
corrplot(corr=cor(mba[ , c(1:10)],use="complete.obs"),
method =c("color"),main="corrplot")
corrgram(mba,order = NULL,panel=panel.cor,lower.panel = panel.shade,text.panel=panel.txt,main="Corrgram")
2a. Identify the crucial managerially relevant question(s) raised in the case 1. Is average gmat score of women > average gmat score of men? Is average salary of men > average salary of women? Do candidates with above average gmat score are placed better than below average gmat score candidates? Which quartile is highly populated? What is the average salary of people with 0-2 years of work experience? What is the relation candidates who are not yet placed with gmat score Figure out how to answer these questions using the given dataset, using R
notplaced <- mba[which(mba$salary==0), ]
placed<- mba[which (mba$salary > 1000) , ]
#1. Comparing gmat scores of men and women
aggregate(gmat_tot ~ sex, data = mba, mean)
## sex gmat_tot
## 1 1 621.2136
## 2 2 614.1176
#2. Comparing salaries of men and women
aggregate(salary ~ sex, data = mba, mean)
## sex salary
## 1 1 37013.62
## 2 2 45121.07
#3. Comparing below average and above average gmat scorers
x<-mean(mba$gmat_tot)
above_avg_gmat<- mba[which (placed$gmat_tot > x) , ]
below_avg_gmat<- mba[which (placed$gmat_tot < x) , ]
mean(above_avg_gmat$salary)
## [1] 34119.94
mean(below_avg_gmat$salary)
## [1] 38999.37
We can observe that below average candidates have performed better at placements
aggregate(salary ~ quarter, data = mba, mean)
## quarter salary
## 1 1 54166.28
## 2 2 37261.01
## 3 3 34037.40
## 4 4 30225.80
aggregate(salary~quarter,data=above_avg_gmat,mean)
## quarter salary
## 1 1 50930.3333
## 2 2 499.1667
aggregate(salary~quarter,data=below_avg_gmat,mean)
## quarter salary
## 1 1 57696.39
## 2 2 436.75
freshers <- placed[which (placed$salary >0) , ]
mean(freshers$salary)
## [1] 103030.7
aggregate(gmat_tot~quarter,data=notplaced,mean)
## quarter gmat_tot
## 1 1 631.1111
## 2 2 605.5556
## 3 3 611.7391
## 4 4 614.0909
2b. Who got how much salary?
#Contingency table showing the affect of variousfactors on the starting salary
mytable <-xtabs(~sex+salary,data=placed)
mytable
## salary
## sex 64000 77000 78256 82000 85000 86000 88000 88500 90000 92000 93000
## 1 0 1 0 0 1 0 0 1 3 2 2
## 2 1 0 1 1 3 2 1 0 0 1 1
## salary
## sex 95000 96000 96500 97000 98000 99000 100000 100400 101000 101100 101600
## 1 4 3 1 2 6 0 4 1 0 1 1
## 2 3 1 0 0 4 1 5 0 2 0 0
## salary
## sex 102500 103000 104000 105000 106000 107000 107300 107500 108000 110000
## 1 1 1 2 11 2 1 1 1 2 0
## 2 0 0 0 0 1 0 0 0 0 1
## salary
## sex 112000 115000 118000 120000 126710 130000 145800 146000 162000 220000
## 1 3 5 1 3 1 1 1 1 1 0
## 2 0 0 0 1 0 0 0 0 0 1
mytable1 <-xtabs(~salary+work_yrs,data=placed)
mytable1
## work_yrs
## salary 0 1 2 3 4 5 6 7 8 10 15 16
## 64000 0 0 1 0 0 0 0 0 0 0 0 0
## 77000 0 0 1 0 0 0 0 0 0 0 0 0
## 78256 0 1 0 0 0 0 0 0 0 0 0 0
## 82000 0 1 0 0 0 0 0 0 0 0 0 0
## 85000 0 1 2 1 0 0 0 0 0 0 0 0
## 86000 0 0 1 1 0 0 0 0 0 0 0 0
## 88000 0 0 0 1 0 0 0 0 0 0 0 0
## 88500 0 0 0 1 0 0 0 0 0 0 0 0
## 90000 0 0 2 0 0 1 0 0 0 0 0 0
## 92000 0 0 3 0 0 0 0 0 0 0 0 0
## 93000 0 0 0 0 1 1 0 0 1 0 0 0
## 95000 1 1 2 2 0 1 0 0 0 0 0 0
## 96000 0 1 2 0 1 0 0 0 0 0 0 0
## 96500 0 0 1 0 0 0 0 0 0 0 0 0
## 97000 0 0 0 1 1 0 0 0 0 0 0 0
## 98000 0 0 7 1 1 0 0 1 0 0 0 0
## 99000 0 0 0 0 0 1 0 0 0 0 0 0
## 100000 0 0 6 1 1 0 1 0 0 0 0 0
## 100400 0 0 0 1 0 0 0 0 0 0 0 0
## 101000 0 0 2 0 0 0 0 0 0 0 0 0
## 101100 0 0 0 0 0 0 0 0 1 0 0 0
## 101600 0 0 0 1 0 0 0 0 0 0 0 0
## 102500 0 0 0 0 0 0 1 0 0 0 0 0
## 103000 0 0 0 1 0 0 0 0 0 0 0 0
## 104000 0 0 0 0 2 0 0 0 0 0 0 0
## 105000 0 0 4 4 0 1 1 0 0 0 0 1
## 106000 0 0 0 0 0 0 2 0 1 0 0 0
## 107000 0 0 1 0 0 0 0 0 0 0 0 0
## 107300 0 0 1 0 0 0 0 0 0 0 0 0
## 107500 0 0 0 1 0 0 0 0 0 0 0 0
## 108000 0 0 0 1 1 0 0 0 0 0 0 0
## 110000 0 0 0 0 0 0 1 0 0 0 0 0
## 112000 0 0 1 0 0 0 1 0 0 0 0 1
## 115000 0 2 0 1 2 0 0 0 0 0 0 0
## 118000 0 0 0 0 0 0 0 0 0 1 0 0
## 120000 0 0 0 1 0 2 0 0 1 0 0 0
## 126710 0 0 0 1 0 0 0 0 0 0 0 0
## 130000 0 0 0 0 1 0 0 0 0 0 0 0
## 145800 0 0 1 0 0 0 0 0 0 0 0 0
## 146000 0 0 0 0 0 0 0 0 0 0 1 0
## 162000 0 1 0 0 0 0 0 0 0 0 0 0
## 220000 0 0 0 0 0 0 0 0 0 0 1 0
#Work experience is an added advantage while getting placed
mytable2<-xtabs(~salary+frstlang,data=placed)
mytable2
## frstlang
## salary 1 2
## 64000 1 0
## 77000 1 0
## 78256 1 0
## 82000 1 0
## 85000 4 0
## 86000 2 0
## 88000 1 0
## 88500 1 0
## 90000 3 0
## 92000 3 0
## 93000 3 0
## 95000 7 0
## 96000 4 0
## 96500 1 0
## 97000 2 0
## 98000 8 2
## 99000 0 1
## 100000 9 0
## 100400 1 0
## 101000 2 0
## 101100 1 0
## 101600 1 0
## 102500 1 0
## 103000 1 0
## 104000 1 1
## 105000 11 0
## 106000 3 0
## 107000 1 0
## 107300 0 1
## 107500 1 0
## 108000 2 0
## 110000 1 0
## 112000 3 0
## 115000 5 0
## 118000 0 1
## 120000 4 0
## 126710 1 0
## 130000 1 0
## 145800 1 0
## 146000 1 0
## 162000 1 0
## 220000 0 1
#English communication is important
mytable3<-xtabs(~salary+gmat_tot,data=placed)
mytable3
## gmat_tot
## salary 500 520 530 540 550 560 570 580 590 600 610 620 630 640 650 660
## 64000 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## 77000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 78256 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 82000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 85000 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1
## 86000 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 88000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
## 88500 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 90000 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0
## 92000 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1
## 93000 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0
## 95000 0 0 1 0 0 2 0 0 0 0 2 0 0 0 0 0
## 96000 0 0 0 0 0 1 0 0 1 1 0 0 0 0 1 0
## 96500 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 97000 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0
## 98000 0 0 0 0 0 1 3 1 1 0 1 0 0 0 0 0
## 99000 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## 100000 0 0 0 0 0 2 0 1 0 1 1 0 1 0 2 0
## 100400 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 101000 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0
## 101100 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 101600 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 102500 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 103000 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 104000 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0
## 105000 0 0 0 0 2 0 2 3 0 1 0 1 0 0 1 0
## 106000 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 107000 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
## 107300 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 107500 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 108000 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0
## 110000 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
## 112000 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
## 115000 0 0 0 1 0 0 1 0 0 0 0 1 1 0 0 0
## 118000 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 120000 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0
## 126710 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## 130000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
## 145800 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 146000 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 162000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 220000 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## gmat_tot
## salary 670 680 700 710 720
## 64000 0 0 0 0 0
## 77000 0 0 0 0 0
## 78256 0 0 0 0 0
## 82000 1 0 0 0 0
## 85000 0 0 1 0 1
## 86000 0 1 0 0 0
## 88000 0 0 0 0 0
## 88500 0 0 0 0 0
## 90000 0 0 0 0 0
## 92000 0 0 0 1 0
## 93000 0 0 0 0 0
## 95000 2 0 0 0 0
## 96000 0 0 0 0 0
## 96500 0 0 0 0 0
## 97000 0 0 0 0 0
## 98000 1 1 0 1 0
## 99000 0 0 0 0 0
## 100000 0 0 0 1 0
## 100400 0 0 0 0 0
## 101000 0 0 0 0 0
## 101100 0 0 0 0 0
## 101600 0 0 0 0 0
## 102500 1 0 0 0 0
## 103000 0 0 0 0 0
## 104000 0 0 0 0 0
## 105000 0 1 0 0 0
## 106000 0 2 0 0 0
## 107000 0 0 0 0 0
## 107300 0 0 0 0 0
## 107500 0 0 0 0 0
## 108000 0 0 0 0 0
## 110000 0 0 0 0 0
## 112000 1 1 0 0 0
## 115000 0 0 0 1 0
## 118000 0 0 0 0 0
## 120000 1 0 1 0 0
## 126710 0 0 0 0 0
## 130000 0 0 0 0 0
## 145800 0 0 0 0 0
## 146000 0 0 0 0 0
## 162000 0 0 1 0 0
## 220000 0 0 0 0 0
Chisquare test
chisq.test(mytable1)
## Warning in chisq.test(mytable1): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable1
## X-squared = 535.23, df = 451, p-value = 0.003809
#Since p<0.01 there is a relationship betweeen work experience and salary
chisq.test(mytable2)
## Warning in chisq.test(mytable2): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable2
## X-squared = 69.847, df = 41, p-value = 0.003296
#Since p<0.01 we can say that there is a relationship between first language and salary
chisq.test(mytable3)
## Warning in chisq.test(mytable3): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable3
## X-squared = 927.24, df = 820, p-value = 0.005279
#Since p<0.01 we see there exists a relationship between Total GMAT score and starting salary.
T-Test
r t.test(placed$salary,placed$work_yrs,var.equal=TRUE, paired=FALSE)
## ## Two Sample t-test ## ## data: placed$salary and placed$work_yrs ## t = 58.516, df = 204, p-value < 2.2e-16 ## alternative hypothesis: true difference in means is not equal to 0 ## 95 percent confidence interval: ## 99555.62 106498.49 ## sample estimates: ## mean of x mean of y ## 1.030307e+05 3.679612e+00 Regression Model
model1 <- lm(salary ~gmat_tot+gmat_qpc+gmat_vpc+gmat_tpc, data = placed)
summary(model1)
##
## Call:
## lm(formula = salary ~ gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc,
## data = placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40370 -8250 -2164 5253 100097
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 109539.54 48054.24 2.279 0.0248 *
## gmat_tot 55.01 181.71 0.303 0.7627
## gmat_qpc 718.40 541.90 1.326 0.1880
## gmat_vpc 546.10 543.85 1.004 0.3178
## gmat_tpc -1663.16 801.57 -2.075 0.0406 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17670 on 98 degrees of freedom
## Multiple R-squared: 0.06089, Adjusted R-squared: 0.02256
## F-statistic: 1.589 on 4 and 98 DF, p-value: 0.1834
#Gmat_tpc is a significant variable in model 1 The multiple R squared value indicates that the model accounts for 6% of the variance in the variables
#Model 2
model2<- lm(salary ~satis+work_yrs+frstlang, data = placed)
summary(model2)
##
## Call:
## lm(formula = salary ~ satis + work_yrs + frstlang, data = placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31764 -9640 -604 4816 76193
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 90600.7 13050.3 6.942 4.07e-10 ***
## satis -1913.1 2000.0 -0.957 0.3411
## work_yrs 2506.8 528.6 4.742 7.11e-06 ***
## frstlang 13541.5 6305.7 2.147 0.0342 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15740 on 99 degrees of freedom
## Multiple R-squared: 0.2466, Adjusted R-squared: 0.2237
## F-statistic: 10.8 on 3 and 99 DF, p-value: 3.354e-06
#Work_yrs and frstlang are significant variables in model 2 The multiple R squared value indicates that the model accounts for 24.66% of the variance in the variables The residual error(15740) can be thought of as the average error in predicting salary using work experience, job satisfaction and first language.
#Model 3
model3 <- lm(salary ~age+sex, data = placed)
summary(model3)
##
## Call:
## lm(formula = salary ~ age + sex, data = placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29047 -9444 -1750 5428 84503
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36859.8 14123.5 2.610 0.0105 *
## age 2653.1 475.1 5.584 2.03e-07 ***
## sex -3743.6 3372.6 -1.110 0.2697
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15540 on 100 degrees of freedom
## Multiple R-squared: 0.2588, Adjusted R-squared: 0.244
## F-statistic: 17.46 on 2 and 100 DF, p-value: 3.144e-07
#Age is a significant factor in model 3
We see that model 2 is better than model 1 and model 3, with a higher R-squared value. 2c. Comparing students with job and without job
mba$Placed = (mba$salary >1000)
mytable_sex <- xtabs(~ Placed+sex, data=mba)
round(ftable(addmargins(prop.table(mytable_sex))),2)
## sex 1 2 Sum
## Placed
## FALSE 0.49 0.14 0.62
## TRUE 0.26 0.11 0.38
## Sum 0.75 0.25 1.00
#26% of men were placed and 11% of women were placed and on the contrary 49% of men were not placed and only 14% of women were not placed.
mytable_frstlang <- xtabs(~ Placed+frstlang, data=mba)
round(ftable(addmargins(prop.table(mytable_frstlang))),2)
## frstlang 1 2 Sum
## Placed
## FALSE 0.53 0.09 0.62
## TRUE 0.35 0.03 0.38
## Sum 0.88 0.12 1.00
Chi square test Hypothesis H1: Satisfaction of MBA course does not depend on salary
chisq.test(mytable_sex)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: mytable_sex
## X-squared = 2.033, df = 1, p-value = 0.1539
chisq.test(mytable_frstlang)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: mytable_frstlang
## X-squared = 3.0938, df = 1, p-value = 0.07859