# Inspect the raw data frame: number of rows/columns, column types and the
# first few values of each column.
str(mbaStudent)
## 'data.frame': 274 obs. of 13 variables:
## $ age : int 23 24 24 24 24 24 25 25 25 25 ...
## $ sex : int 2 1 1 1 2 1 1 2 1 1 ...
## $ gmat_tot: int 620 610 670 570 710 640 610 650 630 680 ...
## $ gmat_qpc: int 77 90 99 56 93 82 89 88 79 99 ...
## $ gmat_vpc: int 87 71 78 81 98 89 74 89 91 81 ...
## $ gmat_tpc: int 87 87 95 75 98 91 87 92 89 96 ...
## $ s_avg : num 3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
## $ f_avg : num 3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
## $ quarter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ work_yrs: int 2 2 2 1 2 2 2 2 2 2 ...
## $ frstlang: int 1 1 1 1 1 1 1 1 2 1 ...
## $ salary : int 0 0 0 0 999 0 0 0 999 998 ...
## $ satis : int 7 6 6 7 5 6 5 6 4 998 ...
# Convert the integer-coded categorical columns sex, frstlang and satis into
# factors; all other columns are left as they are.
# NOTE(review): satis contains the value 998, which looks like a missing-value
# code; it becomes its own factor level here. Confirm against the data
# dictionary.
mbaStudent <- transform(
  mbaStudent,
  sex = factor(sex),
  frstlang = factor(frstlang),
  satis = factor(satis)
)
# Re-check the structure to confirm the three columns are now factors.
str(mbaStudent)
## 'data.frame': 274 obs. of 13 variables:
## $ age : int 23 24 24 24 24 24 25 25 25 25 ...
## $ sex : Factor w/ 2 levels "1","2": 2 1 1 1 2 1 1 2 1 1 ...
## $ gmat_tot: int 620 610 670 570 710 640 610 650 630 680 ...
## $ gmat_qpc: int 77 90 99 56 93 82 89 88 79 99 ...
## $ gmat_vpc: int 87 71 78 81 98 89 74 89 91 81 ...
## $ gmat_tpc: int 87 87 95 75 98 91 87 92 89 96 ...
## $ s_avg : num 3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
## $ f_avg : num 3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
## $ quarter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ work_yrs: int 2 2 2 1 2 2 2 2 2 2 ...
## $ frstlang: Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 2 1 ...
## $ salary : int 0 0 0 0 999 0 0 0 999 998 ...
## $ satis : Factor w/ 8 levels "1","2","3","4",..: 7 6 6 7 5 6 5 6 4 8 ...
# Per-column summary: min/quartiles/mean/max for numeric columns and level
# counts for the factor columns.
summary(mbaStudent)
## age sex gmat_tot gmat_qpc gmat_vpc
## Min. :22.00 1:206 Min. :450.0 Min. :28.00 Min. :16.00
## 1st Qu.:25.00 2: 68 1st Qu.:580.0 1st Qu.:72.00 1st Qu.:71.00
## Median :27.00 Median :620.0 Median :83.00 Median :81.00
## Mean :27.36 Mean :619.5 Mean :80.64 Mean :78.32
## 3rd Qu.:29.00 3rd Qu.:660.0 3rd Qu.:93.00 3rd Qu.:91.00
## Max. :48.00 Max. :790.0 Max. :99.00 Max. :99.00
##
## gmat_tpc s_avg f_avg quarter
## Min. : 0.0 Min. :2.000 Min. :0.000 Min. :1.000
## 1st Qu.:78.0 1st Qu.:2.708 1st Qu.:2.750 1st Qu.:1.250
## Median :87.0 Median :3.000 Median :3.000 Median :2.000
## Mean :84.2 Mean :3.025 Mean :3.062 Mean :2.478
## 3rd Qu.:94.0 3rd Qu.:3.300 3rd Qu.:3.250 3rd Qu.:3.000
## Max. :99.0 Max. :4.000 Max. :4.000 Max. :4.000
##
## work_yrs frstlang salary satis
## Min. : 0.000 1:242 Min. : 0 6 :97
## 1st Qu.: 2.000 2: 32 1st Qu.: 0 5 :74
## Median : 3.000 Median : 999 998 :46
## Mean : 3.872 Mean : 39026 7 :33
## 3rd Qu.: 4.000 3rd Qu.: 97000 4 :17
## Max. :22.000 Max. :220000 3 : 5
## (Other): 2
library(psych)
# Descriptive statistics (psych::describe) for every column except the two
# binary factors sex and frstlang; column order is kept as in mbaStudent.
describe(mbaStudent[, setdiff(names(mbaStudent), c("sex", "frstlang"))])
## vars n mean sd median trimmed mad min max
## age 1 274 27.36 3.71 27 26.76 2.97 22 48
## gmat_tot 2 274 619.45 57.54 620 618.86 59.30 450 790
## gmat_qpc 3 274 80.64 14.87 83 82.31 14.83 28 99
## gmat_vpc 4 274 78.32 16.86 81 80.33 14.83 16 99
## gmat_tpc 5 274 84.20 14.02 87 86.12 11.86 0 99
## s_avg 6 274 3.03 0.38 3 3.03 0.44 2 4
## f_avg 7 274 3.06 0.53 3 3.09 0.37 0 4
## quarter 8 274 2.48 1.11 2 2.47 1.48 1 4
## work_yrs 9 274 3.87 3.23 3 3.29 1.48 0 22
## salary 10 274 39025.69 50951.56 999 33607.86 1481.12 0 220000
## satis* 11 274 5.97 1.28 6 6.00 1.48 1 8
## range skew kurtosis se
## age 26 2.16 6.45 0.22
## gmat_tot 340 -0.01 0.06 3.48
## gmat_qpc 71 -0.92 0.30 0.90
## gmat_vpc 83 -1.04 0.74 1.02
## gmat_tpc 99 -2.28 9.02 0.85
## s_avg 2 -0.06 -0.38 0.02
## f_avg 4 -2.08 10.85 0.03
## quarter 3 0.02 -1.35 0.07
## work_yrs 22 2.78 9.80 0.20
## salary 220000 0.70 -1.05 3078.10
## satis* 7 -0.17 0.28 0.08
# ---- Univariate distributions ----

# Horizontal boxplots of age and the GMAT measures, one plot per page.
box_specs <- list(
  list(column = "age", title = "Age", axis = "years"),
  list(column = "gmat_tot", title = "GMAT score", axis = "score"),
  list(column = "gmat_qpc", title = "GMAT Quantitative percentile",
       axis = "percentile"),
  list(column = "gmat_vpc", title = "GMAT Verbal percentile",
       axis = "percentile"),
  list(column = "gmat_tpc", title = "Total percentile", axis = "percentile")
)
for (spec in box_specs) {
  boxplot(mbaStudent[[spec$column]], horizontal = TRUE, main = spec$title,
          xlab = spec$axis, col = "plum")
}

# Spring and fall MBA averages side by side (1 row x 2 columns).
par(mfrow = c(1, 2))
boxplot(mbaStudent$s_avg, horizontal = FALSE, main = "Spring MBA average",
        ylab = "grade", col = "plum")
boxplot(mbaStudent$f_avg, horizontal = FALSE, main = "Fall MBA average",
        ylab = "grade", col = "plum")
# Reset the layout so the GMAT histograms below are not drawn into the
# leftover 1x2 grid.
par(mfrow = c(1, 1))

# Histograms of the three GMAT percentile measures, one plot per page.
hist_specs <- list(
  list(column = "gmat_tpc", axis = "total percentile",
       title = "GMAT total percentile"),
  list(column = "gmat_qpc", axis = "percentile",
       title = "GMAT quantitative percentile"),
  list(column = "gmat_vpc", axis = "percentile",
       title = "GMAT verbal percentile")
)
for (spec in hist_specs) {
  hist(mbaStudent[[spec$column]], xlab = spec$axis, main = spec$title,
       col = "dodgerblue3")
}

# Spring and fall MBA averages side by side, then restore the single-plot
# layout for everything that follows.
par(mfrow = c(1, 2))
hist(mbaStudent$s_avg, xlab = "Grade", main = "Spring MBA average",
     col = "dodgerblue3")
hist(mbaStudent$f_avg, xlab = "Grade", main = "Fall MBA average",
     col = "dodgerblue4")
par(mfrow = c(1, 1))
library(corrgram)
## Warning: replacing previous import by 'magrittr::%>%' when loading
## 'dendextend'
# Correlogram of the full data set: shaded cells below the diagonal, pie
# glyphs above it.
corrgram(
  mbaStudent,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  main = "Corrgram "
)
# Keep only the students who disclosed a salary: rows with salary > 1000.
# This drops the 0 entries and the 998/999 codes seen in the raw data.
gotJob <- subset(mbaStudent, salary > 1000)
View(gotJob)

# Correlogram restricted to the students with a reported salary.
library(corrgram)
corrgram(
  gotJob,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  main = "Corrgram "
)
# Model 1: salary regressed on first language, years of work experience and
# age, fitted on the students with a reported salary (gotJob).
Model1 <- lm(salary ~ frstlang + work_yrs + age, data = gotJob)
print(Model1)
\(salary =\beta_c + \beta_0 frstlang + \beta_1 workYrs + \beta_2 age + \epsilon\)
# Model 2: Model 1's predictors plus total GMAT score and the fall and spring
# MBA averages, fitted on gotJob.
Model2 <- lm(
  salary ~ frstlang + work_yrs + age + gmat_tot + f_avg + s_avg,
  data = gotJob
)
print(Model2)
\(salary = \beta_c + \beta_0 frstlang + \beta_1 workYrs + \beta_2 age + \beta_3 gmatTot + \beta_4 fallAvg + \beta_5 springAvg + \epsilon\)
# Model 3: Model 2's predictors plus sex and the three GMAT percentile
# measures (quantitative, verbal, total), fitted on gotJob.
Model3 <- lm(
  salary ~ frstlang + work_yrs + age + gmat_tot + f_avg + s_avg + sex +
    gmat_qpc + gmat_vpc + gmat_tpc,
  data = gotJob
)
print(Model3)
\(salary = \beta_c + \beta_0 frstlang + \beta_1 workYrs + \beta_2 age + \beta_3 gmatTot + \beta_4 fallAvg + \beta_5 springAvg + \beta_6 sex + \beta_7 gmatQuant + \beta_8 gmatVerbal + \beta_9 gmatTotalPercentile + \epsilon\)
# Contingency table of sex (rows) by satisfaction level (columns) for the
# students in gotJob. All satis factor levels are tabulated, including those
# with no observations in this subset.
xtabs(~gotJob$sex + gotJob$satis)
## gotJob$satis
## gotJob$sex 1 2 3 4 5 6 7 998
## 1 0 0 0 1 17 40 14 0
## 2 0 0 1 0 12 10 8 0
Run a chi-square test. Hypothesis: among the students who got a job, males are more satisfied than females. H0: Satisfaction and sex are independent.
# Cross-tabulate satisfaction by sex for the students in gotJob.
# satis was made a factor on the full data set, so gotJob still carries levels
# that none of its rows use. Those all-zero rows give zero expected counts and
# make chisq.test() return X-squared = NaN and p-value = NA, so unused levels
# are dropped when the table is built.
myTable1 <- xtabs(~ satis + sex, data = gotJob, drop.unused.levels = TRUE)

# Pearson chi-squared test of independence between satisfaction and sex.
# NOTE(review): some remaining cells are very small, so the "approximation may
# be incorrect" warning may persist; consider simulate.p.value = TRUE or
# fisher.test() if it does.
chisq.test(myTable1)
## Warning in chisq.test(myTable1): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: myTable1
## X-squared = NaN, df = 7, p-value = NA
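The test returned X-squared = NaN and p-value = NA, because the table contains satisfaction levels with no observations (1, 2 and 998). This output therefore gives no evidence either way: we cannot reject the null hypothesis that satisfaction and sex are independent. (H0 would be rejected only if the p-value were below 0.05; the test should be re-run after dropping the empty levels.)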
Run a t-test. We will see whether females are outperforming males in semester grades.
Hypothesis: Females have higher spring + fall average grades than males. H0: There is no significant difference between male and female average semester grades.
# Mean of the combined spring + fall average (s_avg + f_avg) within each sex
# group of gotJob.
aggregate(gotJob$s_avg+gotJob$f_avg, by=list(gotJob$sex), mean)
## Group.1 x
## 1 1 6.110556
## 2 2 6.352258
# Combined spring + fall average per student. It is kept in its own vector so
# that gotJob$s_avg keeps its original meaning and later uses of the column
# (models, plots) are not affected.
total_avg <- gotJob$s_avg + gotJob$f_avg

# Two-sample t-test of the combined average between the two sex groups
# (t.test() defaults to the Welch unequal-variance, two-sided test).
t.test(total_avg ~ gotJob$sex)
##
## Welch Two Sample t-test
##
## data: gotJob$s_avg by gotJob$sex
## t = -1.74, df = 77.562, p-value = 0.08582
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.518269 0.034864
## sample estimates:
## mean in group 1 mean in group 2
## 6.110556 6.352258
# Re-create gotJob from mbaStudent (rows with salary > 1000) so that s_avg
# again holds the original spring average, undoing any in-place change made to
# the column earlier in the analysis.
gotJob <- subset(mbaStudent, salary > 1000)
Result: We cannot say that females outperform males in semester grades. Since the p-value (0.086) is greater than 0.05, we fail to reject the null hypothesis that there is no significant difference between males' and females' semester grades.
# Repeat the per-sex mean of s_avg + f_avg on the re-created gotJob as a
# sanity check; it should match the table computed before the t-test.
aggregate(gotJob$s_avg+gotJob$f_avg, by=list(gotJob$sex), mean)
## Group.1 x
## 1 1 6.110556
## 2 2 6.352258
# Full regression summary for Model1: residual quantiles, coefficient table
# with t-tests, R-squared / adjusted R-squared and the overall F-test.
summary(Model1)
##
## Call:
## lm(formula = salary ~ frstlang + work_yrs + age, data = gotJob)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31941 -9139 -1086 4793 75526
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 49039.7 25119.6 1.952 0.0537 .
## frstlang2 8546.9 6728.1 1.270 0.2069
## work_yrs 747.2 1116.9 0.669 0.5050
## age 1892.0 1075.9 1.759 0.0818 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15570 on 99 degrees of freedom
## Multiple R-squared: 0.2626, Adjusted R-squared: 0.2403
## F-statistic: 11.75 on 3 and 99 DF, p-value: 1.188e-06
# Full regression summary for Model2 (same layout as for Model1), used to
# compare adjusted R-squared across the models.
summary(Model2)
##
## Call:
## lm(formula = salary ~ frstlang + work_yrs + age + gmat_tot +
## f_avg + s_avg, data = gotJob)
##
## Residuals:
## Min 1Q Median 3Q Max
## -34180 -8542 -1237 4941 76534
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 52885.09 32247.47 1.640 0.1043
## frstlang2 9167.61 7065.92 1.297 0.1976
## work_yrs 649.28 1145.52 0.567 0.5722
## age 1844.76 1110.08 1.662 0.0998 .
## gmat_tot -14.78 31.91 -0.463 0.6442
## f_avg -1023.94 3835.20 -0.267 0.7901
## s_avg 3236.73 5031.86 0.643 0.5216
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15770 on 96 degrees of freedom
## Multiple R-squared: 0.2669, Adjusted R-squared: 0.2211
## F-statistic: 5.825 on 6 and 96 DF, p-value: 3.292e-05
# Full regression summary for Model3 (same layout as for Model1), used to
# compare adjusted R-squared across the models.
summary(Model3)
##
## Call:
## lm(formula = salary ~ frstlang + work_yrs + age + gmat_tot +
## f_avg + s_avg + sex + gmat_qpc + gmat_vpc + gmat_tpc, data = gotJob)
##
## Residuals:
## Min 1Q Median 3Q Max
## -30627 -8168 -767 5445 70245
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 56927.8436 49104.8678 1.159 0.2493
## frstlang2 7704.4426 7289.3746 1.057 0.2933
## work_yrs 775.6177 1131.4164 0.686 0.4947
## age 1702.9961 1124.8254 1.514 0.1335
## gmat_tot -0.2345 168.0014 -0.001 0.9989
## f_avg -1693.3454 3815.2794 -0.444 0.6582
## s_avg 5117.8754 4987.7232 1.026 0.3075
## sex2 -3781.6672 3551.3887 -1.065 0.2897
## gmat_qpc 830.0941 488.3993 1.700 0.0926 .
## gmat_vpc 579.0159 488.6739 1.185 0.2391
## gmat_tpc -1465.3294 705.6349 -2.077 0.0406 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15370 on 92 degrees of freedom
## Multiple R-squared: 0.3326, Adjusted R-squared: 0.26
## F-statistic: 4.585 on 10 and 92 DF, p-value: 2.812e-05
We can compare all three models because the dependent variable is the same in each. Model3 has the highest adjusted R-squared (0.260, versus 0.240 for Model1 and 0.221 for Model2), so we choose Model3 over Model1 and Model2.
# Students whose recorded salary is exactly 0.
# NOTE(review): presumably salary == 0 codes "no job"; confirm against the
# data dictionary.
withoutJob <- mbaStudent[which(mbaStudent$salary == 0), ]
View(withoutJob)

# Compare the two groups (gotJob vs. withoutJob) with paired horizontal
# boxplots, one plot per variable.
job_comparisons <- list(
  list(column = "age", title = "Age", axis = "Year "),
  list(column = "gmat_tot", title = "GMAT Total", axis = "Score"),
  list(column = "gmat_qpc", title = "GMAT Quantitative Percentile",
       axis = "Percentile"),
  list(column = "gmat_vpc", title = "GMAT Verbal Percentile",
       axis = "Percentile"),
  list(column = "s_avg", title = "MBA Spring average", axis = "Grades"),
  list(column = "f_avg", title = "MBA Fall average", axis = "Grades"),
  list(column = "work_yrs", title = "Working Years",
       axis = "Year experience")
)
for (spec in job_comparisons) {
  boxplot(
    gotJob[[spec$column]], withoutJob[[spec$column]],
    col = c("skyblue", "pink"),
    horizontal = TRUE,
    main = spec$title,
    xlab = spec$axis,
    names = c("with job", "without job")
  )
}
> RESULT: Students with good semester grades are more likely to get a job. English plays a vital role in the job market.