# Load the MBA starting-salaries survey into mydata.df.
# The file is addressed by its full path rather than via setwd(), so the
# session's working directory is left untouched.
data_dir <- "C:/Users/Jaya/Downloads"
mydata.df <- read.csv(file.path(data_dir, "MBA Starting Salaries Data.csv"))
# View() opens a data viewer; only attempt that in an interactive session.
if (interactive()) {
  View(mydata.df)
}
Summary statistics for each variable: mean, standard deviation, median, and mode.
# Per-column summary of the raw data: min, quartiles, median, mean and max.
summary(mydata.df)
## age sex gmat_tot gmat_qpc
## Min. :22.00 Min. :1.000 Min. :450.0 Min. :28.00
## 1st Qu.:25.00 1st Qu.:1.000 1st Qu.:580.0 1st Qu.:72.00
## Median :27.00 Median :1.000 Median :620.0 Median :83.00
## Mean :27.36 Mean :1.248 Mean :619.5 Mean :80.64
## 3rd Qu.:29.00 3rd Qu.:1.000 3rd Qu.:660.0 3rd Qu.:93.00
## Max. :48.00 Max. :2.000 Max. :790.0 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :16.00 Min. : 0.0 Min. :2.000 Min. :0.000
## 1st Qu.:71.00 1st Qu.:78.0 1st Qu.:2.708 1st Qu.:2.750
## Median :81.00 Median :87.0 Median :3.000 Median :3.000
## Mean :78.32 Mean :84.2 Mean :3.025 Mean :3.062
## 3rd Qu.:91.00 3rd Qu.:94.0 3rd Qu.:3.300 3rd Qu.:3.250
## Max. :99.00 Max. :99.0 Max. :4.000 Max. :4.000
## quarter work_yrs frstlang salary
## Min. :1.000 Min. : 0.000 Min. :1.000 Min. : 0
## 1st Qu.:1.250 1st Qu.: 2.000 1st Qu.:1.000 1st Qu.: 0
## Median :2.000 Median : 3.000 Median :1.000 Median : 999
## Mean :2.478 Mean : 3.872 Mean :1.117 Mean : 39026
## 3rd Qu.:3.000 3rd Qu.: 4.000 3rd Qu.:1.000 3rd Qu.: 97000
## Max. :4.000 Max. :22.000 Max. :2.000 Max. :220000
## satis
## Min. : 1.0
## 1st Qu.: 5.0
## Median : 6.0
## Mean :172.2
## 3rd Qu.: 7.0
## Max. :998.0
# Show the structure of the data frame: its dimensions and each column's type.
str(mydata.df)
## 'data.frame': 274 obs. of 13 variables:
## $ age : int 23 24 24 24 24 24 25 25 25 25 ...
## $ sex : int 2 1 1 1 2 1 1 2 1 1 ...
## $ gmat_tot: int 620 610 670 570 710 640 610 650 630 680 ...
## $ gmat_qpc: int 77 90 99 56 93 82 89 88 79 99 ...
## $ gmat_vpc: int 87 71 78 81 98 89 74 89 91 81 ...
## $ gmat_tpc: int 87 87 95 75 98 91 87 92 89 96 ...
## $ s_avg : num 3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
## $ f_avg : num 3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
## $ quarter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ work_yrs: int 2 2 2 1 2 2 2 2 2 2 ...
## $ frstlang: int 1 1 1 1 1 1 1 1 2 1 ...
## $ salary : int 0 0 0 0 999 0 0 0 999 998 ...
## $ satis : int 7 6 6 7 5 6 5 6 4 998 ...
library(psych)
# Keep only the first five columns of the describe() table
# (vars, n, mean, sd, median).
describe(mydata.df)[, 1:5]
## vars n mean sd median
## age 1 274 27.36 3.71 27
## sex 2 274 1.25 0.43 1
## gmat_tot 3 274 619.45 57.54 620
## gmat_qpc 4 274 80.64 14.87 83
## gmat_vpc 5 274 78.32 16.86 81
## gmat_tpc 6 274 84.20 14.02 87
## s_avg 7 274 3.03 0.38 3
## f_avg 8 274 3.06 0.53 3
## quarter 9 274 2.48 1.11 2
## work_yrs 10 274 3.87 3.23 3
## frstlang 11 274 1.12 0.32 1
## salary 12 274 39025.69 50951.56 999
## satis 13 274 172.18 371.61 6
Next, we visualize the distribution of each variable independently. #Age plot
# Age: histogram of the raw age column.
hist(mydata.df$age, col = "blue", xlab = "Age in years",
     main = "Age Distribution")

# Gender: recode the 1/2 codes into a labelled factor, then plot the counts.
# The is.factor() guard makes this section safe to re-run: recoding an
# already-labelled factor against levels 1/2 would turn every value into NA.
if (!is.factor(mydata.df$sex)) {
  mydata.df$sex <- factor(mydata.df$sex, levels = c(1, 2),
                          labels = c("Male", "Female"))
}
plot(mydata.df$sex, col = "red", main = "Gender distribution")

# GMAT scores
hist(mydata.df$gmat_tot, xlab = "GMAT total score",
     main = "Distribution of GMAT scores", breaks = 20, col = "orange")

# Work experience
hist(mydata.df$work_yrs, col = "pink",
     xlab = "No. of years of work experience",
     main = "Work Experience", breaks = 20)

# First language: same recode-once pattern as for gender.
if (!is.factor(mydata.df$frstlang)) {
  mydata.df$frstlang <- factor(mydata.df$frstlang, levels = c(1, 2),
                               labels = c("English", "Others"))
}
plot(mydata.df$frstlang, col = "yellow", main = "Language Distribution")
# Satisfaction levels
# Keep only ratings on the 1-7 scale. The comparison is made against the
# number 7: comparing the integer column with the string '7' would coerce the
# column to character and compare lexicographically (e.g. "10" <= "7").
# NOTE(review): values above 7 (998 appears in the data) are presumably
# "no answer" codes - confirm against the data dictionary.
newdata <- mydata.df[which(mydata.df$satis <= 7), ]
hist(newdata$satis, breaks = 5, col = "green",
     xlab = "Degree of Satisfaction (1=low,7=high)",
     main = "Satisfaction distribution")

# Starting salary
# Drop the 998 and 999 codes using numeric comparisons; which() also drops
# rows where salary is NA. Rows with salary 0 are kept in newdata1.
newdata1 <- mydata.df[which(mydata.df$salary != 998 &
                              mydata.df$salary != 999), ]
hist(newdata1$salary, breaks = 5, col = "purple",
     xlab = "starting salary", main = "Salary distribution")
#Scatter Plots
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
# Salary against age, sex, first language and GMAT total, all drawn from
# newdata1.
# NOTE(review): `spread` and `smoother.args` look like arguments from the
# pre-3.0 car API - confirm they are still honoured by the installed version.
scatterplot(
  salary ~ age,
  data = newdata1,
  spread = FALSE,
  smoother.args = list(lty = 2),
  main = "Scatter plot of salary vs age",
  xlab = "age",
  ylab = "salary"
)

scatterplot(
  salary ~ sex,
  data = newdata1,
  spread = FALSE,
  smoother.args = list(lty = 2),
  main = "Scatter plot of salary vs sex",
  xlab = "sex",
  ylab = "salary"
)

# This call uses scatterplot()'s default smoother settings.
scatterplot(
  salary ~ frstlang,
  data = newdata1,
  main = "Scatter plot of salary vs first language",
  xlab = "first language",
  ylab = "salary"
)

scatterplot(
  salary ~ gmat_tot,
  data = newdata1,
  spread = FALSE,
  smoother.args = list(lty = 2),
  main = "Scatter plot of salary vs Gmat total",
  xlab = "Gmat score",
  ylab = "salary"
)
library(corrgram)
# Correlogram of newdata1: shaded cells below the diagonal, pies above it,
# variable names on the diagonal, with the variables reordered (order = TRUE).
corrgram(
  newdata1,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "MBA starting salary analysis Correlogram"
)
# Variance-covariance matrix of the numeric columns of newdata1.
# x and y hold the same columns, so cov(x, y) is the covariance of that
# column set with itself.
numeric_cols <- c("age", "gmat_tot", "gmat_qpc", "gmat_vpc", "gmat_tpc",
                  "s_avg", "f_avg", "work_yrs", "salary")
x <- newdata1[, numeric_cols]
y <- x
cov(x, y)
## age gmat_tot gmat_qpc gmat_vpc gmat_tpc
## age 1.778562e+01 -29.954933 -14.089729 -0.4564443 -7.5127645
## gmat_tot -2.995493e+01 3196.950561 636.350928 685.4644322 672.4651878
## gmat_qpc -1.408973e+01 636.350928 229.384067 42.7985481 141.4933074
## gmat_vpc -4.564443e-01 685.464432 42.798548 259.2695920 149.8747571
## gmat_tpc -7.512764e+00 672.465188 141.493307 149.8747571 183.0113882
## s_avg 2.626913e-01 3.076706 0.109287 1.1636153 0.9688199
## f_avg -7.513817e-02 2.969557 1.025241 0.2769703 0.7718585
## work_yrs 1.355880e+01 -36.222204 -13.484078 -2.4562014 -8.2897776
## salary -2.918528e+04 -170.881369 22855.717832 2901.3078044 43822.5291991
## s_avg f_avg work_yrs salary
## age 0.2626913 -0.07513817 1.355880e+01 -2.918528e+04
## gmat_tot 3.0767055 2.96955689 -3.622220e+01 -1.708814e+02
## gmat_qpc 0.1092870 1.02524072 -1.348408e+01 2.285572e+04
## gmat_vpc 1.1636153 0.27697026 -2.456201e+00 2.901308e+03
## gmat_tpc 0.9688199 0.77185854 -8.289778e+00 4.382253e+04
## s_avg 0.1436561 0.10251263 2.224652e-01 1.940528e+03
## f_avg 0.1025126 0.26995964 -9.189254e-02 2.443157e+02
## work_yrs 0.2224652 -0.09189254 1.360379e+01 -1.044263e+04
## salary 1940.5276360 244.31568869 -1.044263e+04 2.825177e+09
# job.df: rows that report an actual starting salary.
# Salary values 0, 998 and 999 are excluded with numeric comparisons, because
# comparing the integer column with strings such as "998" would coerce the
# column to character. which() also drops rows where salary is NA.
# NOTE(review): 998/999 are presumably survey non-response codes and 0 marks
# students without a job - confirm against the data dictionary.
has_salary <- with(mydata.df, salary != 998 & salary != 999 & salary != 0)
job.df <- mydata.df[which(has_salary), ]
head(job.df)
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter
## 35 22 Female 660 90 92 94 3.5 3.75 1
## 36 27 Female 700 94 98 98 3.3 3.25 1
## 37 25 Female 680 87 96 96 3.5 2.67 1
## 38 25 Female 650 82 91 93 3.4 3.25 1
## 39 27 Male 710 96 96 98 3.3 3.50 1
## 40 28 Female 620 52 98 87 3.4 3.75 1
## work_yrs frstlang salary satis
## 35 1 English 85000 5
## 36 2 English 85000 6
## 37 2 English 86000 5
## 38 3 English 88000 7
## 39 2 English 92000 6
## 40 5 English 93000 5
From this table we see that most higher starting salaries have been awarded to men.
# Cross-tabulate starting salary (rows) against years of work experience
# (columns) for the students in job.df.
mytable1 <- xtabs(~ salary + work_yrs, data = job.df)
mytable1
## work_yrs
## salary 0 1 2 3 4 5 6 7 8 10 15 16
## 64000 0 0 1 0 0 0 0 0 0 0 0 0
## 77000 0 0 1 0 0 0 0 0 0 0 0 0
## 78256 0 1 0 0 0 0 0 0 0 0 0 0
## 82000 0 1 0 0 0 0 0 0 0 0 0 0
## 85000 0 1 2 1 0 0 0 0 0 0 0 0
## 86000 0 0 1 1 0 0 0 0 0 0 0 0
## 88000 0 0 0 1 0 0 0 0 0 0 0 0
## 88500 0 0 0 1 0 0 0 0 0 0 0 0
## 90000 0 0 2 0 0 1 0 0 0 0 0 0
## 92000 0 0 3 0 0 0 0 0 0 0 0 0
## 93000 0 0 0 0 1 1 0 0 1 0 0 0
## 95000 1 1 2 2 0 1 0 0 0 0 0 0
## 96000 0 1 2 0 1 0 0 0 0 0 0 0
## 96500 0 0 1 0 0 0 0 0 0 0 0 0
## 97000 0 0 0 1 1 0 0 0 0 0 0 0
## 98000 0 0 7 1 1 0 0 1 0 0 0 0
## 99000 0 0 0 0 0 1 0 0 0 0 0 0
## 100000 0 0 6 1 1 0 1 0 0 0 0 0
## 100400 0 0 0 1 0 0 0 0 0 0 0 0
## 101000 0 0 2 0 0 0 0 0 0 0 0 0
## 101100 0 0 0 0 0 0 0 0 1 0 0 0
## 101600 0 0 0 1 0 0 0 0 0 0 0 0
## 102500 0 0 0 0 0 0 1 0 0 0 0 0
## 103000 0 0 0 1 0 0 0 0 0 0 0 0
## 104000 0 0 0 0 2 0 0 0 0 0 0 0
## 105000 0 0 4 4 0 1 1 0 0 0 0 1
## 106000 0 0 0 0 0 0 2 0 1 0 0 0
## 107000 0 0 1 0 0 0 0 0 0 0 0 0
## 107300 0 0 1 0 0 0 0 0 0 0 0 0
## 107500 0 0 0 1 0 0 0 0 0 0 0 0
## 108000 0 0 0 1 1 0 0 0 0 0 0 0
## 110000 0 0 0 0 0 0 1 0 0 0 0 0
## 112000 0 0 1 0 0 0 1 0 0 0 0 1
## 115000 0 2 0 1 2 0 0 0 0 0 0 0
## 118000 0 0 0 0 0 0 0 0 0 1 0 0
## 120000 0 0 0 1 0 2 0 0 1 0 0 0
## 126710 0 0 0 1 0 0 0 0 0 0 0 0
## 130000 0 0 0 0 1 0 0 0 0 0 0 0
## 145800 0 0 1 0 0 0 0 0 0 0 0 0
## 146000 0 0 0 0 0 0 0 0 0 0 1 0
## 162000 0 1 0 0 0 0 0 0 0 0 0 0
## 220000 0 0 0 0 0 0 0 0 0 0 1 0
The above table suggests that most placed students have at least 2 years of work experience, although a few students with 0 or 1 years also received offers.
# Cross-tabulate starting salary (rows) against first language (columns)
# for the students in job.df.
mytable2 <- xtabs(~ salary + frstlang, data = job.df)
mytable2
## frstlang
## salary English Others
## 64000 1 0
## 77000 1 0
## 78256 1 0
## 82000 1 0
## 85000 4 0
## 86000 2 0
## 88000 1 0
## 88500 1 0
## 90000 3 0
## 92000 3 0
## 93000 3 0
## 95000 7 0
## 96000 4 0
## 96500 1 0
## 97000 2 0
## 98000 8 2
## 99000 0 1
## 100000 9 0
## 100400 1 0
## 101000 2 0
## 101100 1 0
## 101600 1 0
## 102500 1 0
## 103000 1 0
## 104000 1 1
## 105000 11 0
## 106000 3 0
## 107000 1 0
## 107300 0 1
## 107500 1 0
## 108000 2 0
## 110000 1 0
## 112000 3 0
## 115000 5 0
## 118000 0 1
## 120000 4 0
## 126710 1 0
## 130000 1 0
## 145800 1 0
## 146000 1 0
## 162000 1 0
## 220000 0 1
It is seen that students with English as first language are mostly preferred and get higher salaries and jobs compared to those whose first language is not English.
# Cross-tabulate starting salary (rows) against total GMAT score (columns)
# for the students in job.df.
mytable3 <- xtabs(~ salary + gmat_tot, data = job.df)
mytable3
## gmat_tot
## salary 500 520 530 540 550 560 570 580 590 600 610 620 630 640 650 660
## 64000 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## 77000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 78256 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 82000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 85000 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1
## 86000 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 88000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
## 88500 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 90000 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0
## 92000 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1
## 93000 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0
## 95000 0 0 1 0 0 2 0 0 0 0 2 0 0 0 0 0
## 96000 0 0 0 0 0 1 0 0 1 1 0 0 0 0 1 0
## 96500 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 97000 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0
## 98000 0 0 0 0 0 1 3 1 1 0 1 0 0 0 0 0
## 99000 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## 100000 0 0 0 0 0 2 0 1 0 1 1 0 1 0 2 0
## 100400 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 101000 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0
## 101100 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 101600 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 102500 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 103000 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 104000 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0
## 105000 0 0 0 0 2 0 2 3 0 1 0 1 0 0 1 0
## 106000 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 107000 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
## 107300 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 107500 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 108000 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0
## 110000 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
## 112000 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
## 115000 0 0 0 1 0 0 1 0 0 0 0 1 1 0 0 0
## 118000 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 120000 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0
## 126710 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## 130000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
## 145800 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 146000 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 162000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 220000 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## gmat_tot
## salary 670 680 700 710 720
## 64000 0 0 0 0 0
## 77000 0 0 0 0 0
## 78256 0 0 0 0 0
## 82000 1 0 0 0 0
## 85000 0 0 1 0 1
## 86000 0 1 0 0 0
## 88000 0 0 0 0 0
## 88500 0 0 0 0 0
## 90000 0 0 0 0 0
## 92000 0 0 0 1 0
## 93000 0 0 0 0 0
## 95000 2 0 0 0 0
## 96000 0 0 0 0 0
## 96500 0 0 0 0 0
## 97000 0 0 0 0 0
## 98000 1 1 0 1 0
## 99000 0 0 0 0 0
## 100000 0 0 0 1 0
## 100400 0 0 0 0 0
## 101000 0 0 0 0 0
## 101100 0 0 0 0 0
## 101600 0 0 0 0 0
## 102500 1 0 0 0 0
## 103000 0 0 0 0 0
## 104000 0 0 0 0 0
## 105000 0 1 0 0 0
## 106000 0 2 0 0 0
## 107000 0 0 0 0 0
## 107300 0 0 0 0 0
## 107500 0 0 0 0 0
## 108000 0 0 0 0 0
## 110000 0 0 0 0 0
## 112000 1 1 0 0 0
## 115000 0 0 0 1 0
## 118000 0 0 0 0 0
## 120000 1 0 1 0 0
## 126710 0 0 0 0 0
## 130000 0 0 0 0 0
## 145800 0 0 0 0 0
## 146000 0 0 0 0 0
## 162000 0 0 1 0 0
## 220000 0 0 0 0 0
# Two-sample t-test (equal variances assumed) comparing the natural log of
# salary between the two levels of sex in job.df.
log.transformed.salary <- log(job.df$salary)
t.test(log.transformed.salary ~ job.df$sex, var.equal = TRUE)
##
## Two Sample t-test
##
## data: log.transformed.salary by job.df$sex
## t = 2.4552, df = 101, p-value = 0.01579
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.01470674 0.13847594
## sample estimates:
## mean in group Male mean in group Female
## 11.55390 11.47731
# Pearson chi-squared test of independence on the salary x work_yrs table.
# R warns that the chi-squared approximation may be incorrect for this table.
# NOTE(review): most cells of mytable1 hold 0 or 1 observations, which is
# presumably why - treat the p-value with caution.
chisq.test(mytable1)
## Warning in chisq.test(mytable1): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable1
## X-squared = 535.23, df = 451, p-value = 0.003809
Since p<0.01, there is a relationship between work experience and salary.
# Pearson chi-squared test of independence on the salary x frstlang table.
# R warns that the chi-squared approximation may be incorrect for this table.
# NOTE(review): most cells of mytable2 hold very few observations, which is
# presumably why - treat the p-value with caution.
chisq.test(mytable2)
## Warning in chisq.test(mytable2): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable2
## X-squared = 69.847, df = 41, p-value = 0.003296
Since p<0.01 we can say that there is a relationship between first language and salary
# Pearson chi-squared test of independence on the salary x gmat_tot table.
# R warns that the chi-squared approximation may be incorrect for this table.
# NOTE(review): most cells of mytable3 hold 0 or 1 observations, which is
# presumably why - treat the p-value with caution.
chisq.test(mytable3)
## Warning in chisq.test(mytable3): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable3
## X-squared = 927.24, df = 820, p-value = 0.005279
Since p<0.01 we see there exists a relationship between Total GMAT score and starting salary.
# Model 1: starting salary regressed on the four GMAT measures.
# Variables are named bare in the formula and resolved through `data`, so the
# fitted model can be used with predict(newdata = ...). The estimates are the
# same as when the columns are written as job.df$...; only the coefficient
# labels differ (e.g. gmat_tot instead of job.df$gmat_tot).
fit <- lm(salary ~ gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc, data = job.df)
summary(fit)
##
## Call:
## lm(formula = job.df$salary ~ job.df$gmat_tot + job.df$gmat_qpc +
## job.df$gmat_vpc + job.df$gmat_tpc, data = job.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40370 -8250 -2164 5253 100097
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 109539.54 48054.24 2.279 0.0248 *
## job.df$gmat_tot 55.01 181.71 0.303 0.7627
## job.df$gmat_qpc 718.40 541.90 1.326 0.1880
## job.df$gmat_vpc 546.10 543.85 1.004 0.3178
## job.df$gmat_tpc -1663.16 801.57 -2.075 0.0406 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17670 on 98 degrees of freedom
## Multiple R-squared: 0.06089, Adjusted R-squared: 0.02256
## F-statistic: 1.589 on 4 and 98 DF, p-value: 0.1834
gmat_tpc is the only significant predictor in model 1. The multiple R-squared value indicates that the model accounts for about 6% of the variance in salary. The residual standard error (17670) can be thought of as the typical error in predicting salary from the available GMAT data.
# Model 2: starting salary regressed on satisfaction, years of work
# experience and first language.
# Variables are named bare in the formula and resolved through `data`, so the
# fitted model can be used with predict(newdata = ...). The estimates are the
# same as when the columns are written as job.df$...; only the coefficient
# labels differ (e.g. work_yrs instead of job.df$work_yrs).
fit1 <- lm(salary ~ satis + work_yrs + frstlang, data = job.df)
summary(fit1)
##
## Call:
## lm(formula = job.df$salary ~ job.df$satis + job.df$work_yrs +
## job.df$frstlang, data = job.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31764 -9640 -604 4816 76193
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 104142.2 11899.4 8.752 5.73e-14 ***
## job.df$satis -1913.1 2000.0 -0.957 0.3411
## job.df$work_yrs 2506.8 528.6 4.742 7.11e-06 ***
## job.df$frstlangOthers 13541.5 6305.7 2.147 0.0342 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15740 on 99 degrees of freedom
## Multiple R-squared: 0.2466, Adjusted R-squared: 0.2237
## F-statistic: 10.8 on 3 and 99 DF, p-value: 3.354e-06
work_yrs and frstlang are significant variables in model 2. The multiple R-squared value indicates that the model accounts for 24.66% of the variance in salary. The residual standard error (15740) can be thought of as the typical error in predicting salary using work experience, job satisfaction, and first language.
# Model 3: starting salary regressed on age and sex.
# Variables are named bare in the formula and resolved through `data`, so the
# fitted model can be used with predict(newdata = ...). The estimates are the
# same as when the columns are written as job.df$...; only the coefficient
# labels differ (e.g. age instead of job.df$age).
fit2 <- lm(salary ~ age + sex, data = job.df)
summary(fit2)
##
## Call:
## lm(formula = job.df$salary ~ job.df$age + job.df$sex, data = job.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29047 -9444 -1750 5428 84503
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33116.2 12997.5 2.548 0.0124 *
## job.df$age 2653.1 475.1 5.584 2.03e-07 ***
## job.df$sexFemale -3743.6 3372.6 -1.110 0.2697
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15540 on 100 degrees of freedom
## Multiple R-squared: 0.2588, Adjusted R-squared: 0.244
## F-statistic: 17.46 on 2 and 100 DF, p-value: 3.144e-07
Age is a significant factor in model 3
Comparing R-squared values, model 3 (0.2588) fits slightly better than model 2 (0.2466), and both fit far better than model 1 (0.0609).
# nojob.df: rows whose salary is recorded as 0.
# A salary of 0 is by definition neither 998 nor 999, so testing salary == 0
# alone selects the rows; no string comparisons against "998"/"999" are
# needed. which() drops rows where salary is NA.
nojob.df <- mydata.df[which(mydata.df$salary == 0), ]
head(nojob.df)
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter
## 1 23 Female 620 77 87 87 3.4 3.00 1
## 2 24 Male 610 90 71 87 3.5 4.00 1
## 3 24 Male 670 99 78 95 3.3 3.25 1
## 4 24 Male 570 56 81 75 3.3 2.67 1
## 6 24 Male 640 82 89 91 3.9 3.75 1
## 7 25 Male 610 89 74 87 3.4 3.50 1
## work_yrs frstlang salary satis
## 1 2 English 0 7
## 2 2 English 0 6
## 3 2 English 0 6
## 4 1 English 0 7
## 6 2 English 0 6
## 7 2 English 0 5
# Histogram of total GMAT scores for the students in nojob.df.
hist(nojob.df$gmat_tot, breaks = 10, col = "green",
     xlab = "GMAT score",
     main = "GMAT performance of students with no job")
GMAT scores are concentrated between 550 and 650 for unplaced students, while they are more scattered among those who do have a job.
# Pearson chi-squared test of independence between years of work experience
# and satisfaction among the students in nojob.df.
# R warns that the chi-squared approximation may be incorrect here.
# NOTE(review): satis is not restricted to the 1-7 scale in nojob.df the way
# it was for the satisfaction histogram - confirm whether 998-coded rows are
# present and should be excluded before testing.
chisq.test(nojob.df$work_yrs,nojob.df$satis)
## Warning in chisq.test(nojob.df$work_yrs, nojob.df$satis): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: nojob.df$work_yrs and nojob.df$satis
## X-squared = 44.974, df = 48, p-value = 0.5976