# Load the MBA starting-salaries survey and take a first look at it.
# The data directory is held in one variable and joined to the file name with
# file.path(), so the script no longer changes the session's working
# directory through setwd().
data_dir <- "~/Desktop/5 SRM Kashish Mukheja/Downoad content"
mba <- read.csv(file.path(data_dir, "MBA Starting Salaries Data.csv"))
# View() opens a spreadsheet-style viewer, which only makes sense (and only
# works reliably) in an interactive session.
if (interactive()) {
  View(mba)
}
colnames(mba)
## [1] "age" "sex" "gmat_tot" "gmat_qpc" "gmat_vpc" "gmat_tpc"
## [7] "s_avg" "f_avg" "quarter" "work_yrs" "frstlang" "salary"
## [13] "satis"
library(psych)
# Per-column summary statistics for the unfiltered data.
# NOTE(review): the output below shows salary with a median of 999 and satis
# with a maximum of 998; these look like survey codes rather than real values,
# so the salary/satis rows of this summary should be read with care.
describe(mba)
## vars n mean sd median trimmed mad min max
## age 1 274 27.36 3.71 27 26.76 2.97 22 48
## sex 2 274 1.25 0.43 1 1.19 0.00 1 2
## gmat_tot 3 274 619.45 57.54 620 618.86 59.30 450 790
## gmat_qpc 4 274 80.64 14.87 83 82.31 14.83 28 99
## gmat_vpc 5 274 78.32 16.86 81 80.33 14.83 16 99
## gmat_tpc 6 274 84.20 14.02 87 86.12 11.86 0 99
## s_avg 7 274 3.03 0.38 3 3.03 0.44 2 4
## f_avg 8 274 3.06 0.53 3 3.09 0.37 0 4
## quarter 9 274 2.48 1.11 2 2.47 1.48 1 4
## work_yrs 10 274 3.87 3.23 3 3.29 1.48 0 22
## frstlang 11 274 1.12 0.32 1 1.02 0.00 1 2
## salary 12 274 39025.69 50951.56 999 33607.86 1481.12 0 220000
## satis 13 274 172.18 371.61 6 91.50 1.48 1 998
## range skew kurtosis se
## age 26 2.16 6.45 0.22
## sex 1 1.16 -0.66 0.03
## gmat_tot 340 -0.01 0.06 3.48
## gmat_qpc 71 -0.92 0.30 0.90
## gmat_vpc 83 -1.04 0.74 1.02
## gmat_tpc 99 -2.28 9.02 0.85
## s_avg 2 -0.06 -0.38 0.02
## f_avg 4 -2.08 10.85 0.03
## quarter 3 0.02 -1.35 0.07
## work_yrs 22 2.78 9.80 0.20
## frstlang 1 2.37 3.65 0.02
## salary 220000 0.70 -1.05 3078.10
## satis 997 1.77 1.13 22.45
# mbas: rows whose salary is neither 998 nor 999 (presumably the survey's
# "no answer" codes — confirm against the data dictionary).
# NOTE(review): satis also reaches 998 in the summary above; confirm whether it
# needs the same filtering.
mbas <- subset(mba, salary != 998 & salary != 999)
# The data viewer is only opened in interactive sessions so that the script
# also runs under Rscript / knitr.
if (interactive()) {
  View(mbas)
}
# mbasp: the subset of mbas reporting a non-zero salary (used below as the
# "placed" group).
mbasp <- subset(mbas, salary != 0)
if (interactive()) {
  View(mbasp)
}
library(lattice)
# Count histograms (10 bins each) of age, total GMAT score and years of work
# experience, drawn from the unfiltered data frame `mba`.
histogram(
  ~ age,
  data = mba,
  type = "count",
  nint = 10,
  xlab = "Age",
  main = "Age distribution" # title typo ("distibution") corrected
)
histogram(
  ~ gmat_tot,
  data = mba,
  type = "count",
  nint = 10,
  xlab = "gmat_tot",
  main = "Total Gmat scores"
)
histogram(
  ~ work_yrs,
  data = mba,
  type = "count",
  nint = 10,
  xlab = "work_yrs",
  main = "Working years"
)
# Average total GMAT score for rows with frstlang == 1, then for every other
# frstlang code.
with(mba, mean(gmat_tot[frstlang == 1]))
## [1] 622.2727
with(mba, mean(gmat_tot[frstlang != 1]))
## [1] 598.125
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
# Scatterplots on mbasp (rows with a non-zero salary): salary against age,
# work experience, satisfaction and quarter, plus work experience against age.
# NOTE(review): `spread` and `smoother.args` look like the car 2.x interface;
# newer car releases appear to take smoother options through
# `smooth = list(...)` — confirm against the installed car version.
scatterplot(
  salary ~ age,
  data = mbasp,
  spread = FALSE,
  smoother.args = list(lty = 2),
  main = "Salary vs Age",
  xlab = "Age",
  ylab = "Salary"
)
scatterplot(
  salary ~ work_yrs,
  data = mbasp,
  spread = FALSE,
  smoother.args = list(lty = 2),
  main = "Work Experience Vs Salary",
  xlab = "Work_yrs",
  ylab = "Salary"
)
scatterplot(
  salary ~ satis,
  data = mbasp,
  spread = FALSE,
  smoother.args = list(lty = 2),
  main = "Satisfaction Vs Salary",
  xlab = "Satisfaction Score",
  ylab = "Salary"
)
scatterplot(
  work_yrs ~ age,
  data = mbasp,
  spread = FALSE,
  smoother.args = list(lty = 2),
  main = "Work Experience Vs Age",
  xlab = "Age",
  ylab = "Work Experience"
)
scatterplot(
  salary ~ quarter,
  data = mbasp,
  spread = FALSE,
  smoother.args = list(lty = 2),
  main = "Salary Vs Quarter",
  xlab = "Quarter",
  ylab = "Salary"
)
library(corrgram)
library(ellipse)
##
## Attaching package: 'ellipse'
## The following object is masked from 'package:car':
##
## ellipse
## The following object is masked from 'package:graphics':
##
## pairs
# Correlogram of every column of mbas. mbas keeps all rows whose salary is not
# 998/999, i.e. both zero-salary and non-zero-salary respondents, so the title
# now says so (it previously claimed only the not-placed group was shown).
corrgram(
  mbas,
  order = FALSE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Corrgram of all respondents who gave salary information (placed and not placed)"
)
library(corrgram)
library(ellipse)
# Correlogram of every column of mbasp (rows with a non-zero salary): shaded
# cells below the diagonal, pie glyphs above it, variable names on it.
corrgram(
  mbasp,
  order = FALSE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Corrgram of those who got placed and gave information"
)
H0: The mean starting salary of males is not less than that of females.
H1: The mean starting salary of males is less than that of females (one-sided, matching alternative = "less" in the test below).
# Welch two-sample t-test of salary for sex == 1 against sex == 2 among the
# non-zero-salary rows. alternative = "less" makes this one-sided: the
# alternative is that the first group's mean is below the second's.
t.test(
  x = mbasp$salary[mbasp$sex == 1],
  y = mbasp$salary[mbasp$sex == 2],
  alternative = "less"
)
##
## Welch Two Sample t-test
##
## data: mbasp$salary[mbasp$sex == 1] and mbasp$salary[mbasp$sex == 2]
## t = 1.3628, df = 38.115, p-value = 0.9095
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
## -Inf 14421.13
## sample estimates:
## mean of x mean of y
## 104970.97 98524.39
Inference: Since the p-value (0.9095) > 0.05, we fail to reject H0; this one-sided test gives no evidence that the mean starting salary of males is lower than that of females.
H0: There is no significant difference between the starting salaries of those whose first language is English and those whose first language is not English.
H1: There is a significant difference between the starting salaries of those whose first language is English and those whose first language is not English.
# Two-sided Welch t-test of salary for frstlang == 1 against frstlang == 2
# among the non-zero-salary rows.
t.test(
  x = mbasp$salary[mbasp$frstlang == 1],
  y = mbasp$salary[mbasp$frstlang == 2]
)
##
## Welch Two Sample t-test
##
## data: mbasp$salary[mbasp$frstlang == 1] and mbasp$salary[mbasp$frstlang == 2]
## t = -1.1202, df = 6.0863, p-value = 0.3049
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -59933.62 22202.25
## sample estimates:
## mean of x mean of y
## 101748.6 120614.3
Inference: Since the p-value (0.3049) > 0.05, we fail to reject H0; there is no significant difference between the starting salaries of those whose first language is English and those whose first language is not English.
H0: The mean starting salary of those aged 27 or younger is not less than that of those older than 27.
H1: The mean starting salary of those aged 27 or younger is less than that of those older than 27 (one-sided, matching alternative = "less" in the test below).
# Welch t-test of salary for age <= 27 against age > 27 among the
# non-zero-salary rows. alternative = "less" makes this one-sided: the
# alternative is that the younger group's mean is below the older group's.
t.test(
  x = mbasp$salary[mbasp$age <= 27],
  y = mbasp$salary[mbasp$age > 27],
  alternative = "less"
)
##
## Welch Two Sample t-test
##
## data: mbasp$salary[mbasp$age <= 27] and mbasp$salary[mbasp$age > 27]
## t = -2.2726, df = 38.266, p-value = 0.01437
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
## -Inf -2676.967
## sample estimates:
## mean of x mean of y
## 100011.9 110376.7
Inference: Since the p-value (0.01437) < 0.05, we reject the null hypothesis. Hence there is a significant difference between the two age groups: the mean starting salary of those aged 27 or younger is lower than that of those older than 27.
# Model 1: salary regressed on the GMAT scores/percentiles, both grade
# averages, quarter, work experience and satisfaction, for the
# non-zero-salary rows.
# Predictors are written as bare column names resolved through `data = mbasp`
# (instead of `mbasp$col`), so `data` is actually used and predict() can take
# `newdata`. The estimates are unchanged; the output recorded below was
# produced by the earlier `mbasp$`-prefixed formula, which is why its Call and
# term labels carry that prefix.
fit1 <- lm(
  salary ~ gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc +
    s_avg + f_avg + quarter + work_yrs + satis,
  data = mbasp
)
summary(fit1)
##
## Call:
## lm(formula = mbasp$salary ~ mbasp$gmat_tot + mbasp$gmat_qpc +
## mbasp$gmat_vpc + mbasp$gmat_tpc + mbasp$s_avg + mbasp$f_avg +
## mbasp$quarter + mbasp$work_yrs + mbasp$satis, data = mbasp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -35858 -7483 -6 4104 75509
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 117561.06 49384.33 2.381 0.0193 *
## mbasp$gmat_tot 58.14 180.07 0.323 0.7475
## mbasp$gmat_qpc 838.14 504.39 1.662 0.0999 .
## mbasp$gmat_vpc 533.01 507.55 1.050 0.2964
## mbasp$gmat_tpc -1670.82 726.00 -2.301 0.0236 *
## mbasp$s_avg -3155.10 8243.61 -0.383 0.7028
## mbasp$f_avg -3237.85 3931.51 -0.824 0.4123
## mbasp$quarter -3146.53 2756.33 -1.142 0.2566
## mbasp$work_yrs 2572.46 571.19 4.504 1.94e-05 ***
## mbasp$satis -68.92 2170.36 -0.032 0.9747
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15780 on 93 degrees of freedom
## Multiple R-squared: 0.2887, Adjusted R-squared: 0.2198
## F-statistic: 4.194 on 9 and 93 DF, p-value: 0.0001432
# Model 2: the model-1 predictors plus age.
# Predictors are written as bare column names resolved through `data = mbasp`
# (instead of `mbasp$col`), so `data` is actually used and predict() can take
# `newdata`. The estimates are unchanged; the output recorded below was
# produced by the earlier `mbasp$`-prefixed formula, which is why its Call and
# term labels carry that prefix.
fit2 <- lm(
  salary ~ gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc +
    s_avg + f_avg + quarter + work_yrs + satis + age,
  data = mbasp
)
summary(fit2)
##
## Call:
## lm(formula = mbasp$salary ~ mbasp$gmat_tot + mbasp$gmat_qpc +
## mbasp$gmat_vpc + mbasp$gmat_tpc + mbasp$s_avg + mbasp$f_avg +
## mbasp$quarter + mbasp$work_yrs + mbasp$satis + mbasp$age,
## data = mbasp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26196 -8241 -324 5297 70000
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 69019.43 52376.30 1.318 0.1909
## mbasp$gmat_tot 29.52 176.18 0.168 0.8673
## mbasp$gmat_qpc 813.29 492.44 1.652 0.1020
## mbasp$gmat_vpc 489.93 495.74 0.988 0.3256
## mbasp$gmat_tpc -1479.96 713.20 -2.075 0.0408 *
## mbasp$s_avg -3124.32 8046.45 -0.388 0.6987
## mbasp$f_avg -2345.08 3855.93 -0.608 0.5446
## mbasp$quarter -2787.20 2694.67 -1.034 0.3037
## mbasp$work_yrs 360.74 1087.30 0.332 0.7408
## mbasp$satis -719.58 2136.17 -0.337 0.7370
## mbasp$age 2379.27 1004.19 2.369 0.0199 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15410 on 92 degrees of freedom
## Multiple R-squared: 0.3296, Adjusted R-squared: 0.2567
## F-statistic: 4.523 on 10 and 92 DF, p-value: 3.341e-05
# Model 3: total GMAT score only (no GMAT percentile columns), both grade
# averages, quarter, work experience, satisfaction and age.
# Predictors are written as bare column names resolved through `data = mbasp`
# (instead of `mbasp$col`), so `data` is actually used and predict() can take
# `newdata`. The estimates are unchanged; the output recorded below was
# produced by the earlier `mbasp$`-prefixed formula, which is why its Call and
# term labels carry that prefix.
fit3 <- lm(
  salary ~ gmat_tot + s_avg + f_avg + quarter + work_yrs + satis + age,
  data = mbasp
)
summary(fit3)
##
## Call:
## lm(formula = mbasp$salary ~ mbasp$gmat_tot + mbasp$s_avg + mbasp$f_avg +
## mbasp$quarter + mbasp$work_yrs + mbasp$satis + mbasp$age,
## data = mbasp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25359 -8539 -1011 5144 80571
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 72607.27 41174.65 1.763 0.0810 .
## mbasp$gmat_tot -14.81 32.12 -0.461 0.6458
## mbasp$s_avg -3736.71 8002.20 -0.467 0.6416
## mbasp$f_avg -779.13 3848.67 -0.202 0.8400
## mbasp$quarter -2054.06 2662.26 -0.772 0.4423
## mbasp$work_yrs 242.38 1117.61 0.217 0.8288
## mbasp$satis -1702.09 2096.83 -0.812 0.4190
## mbasp$age 2512.61 1025.40 2.450 0.0161 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15860 on 95 degrees of freedom
## Multiple R-squared: 0.266, Adjusted R-squared: 0.2119
## F-statistic: 4.918 on 7 and 95 DF, p-value: 9.018e-05
# Model 4 (fourth model fitted): GMAT scores/percentiles, both grade averages,
# quarter, satisfaction, age and first language — work_yrs is left out.
# NOTE: this assigns to `fit2` again, replacing the earlier fit2 object.
# Predictors are written as bare column names resolved through `data = mbasp`
# (instead of `mbasp$col`), so `data` is actually used and predict() can take
# `newdata`. The estimates are unchanged; the output recorded below was
# produced by the earlier `mbasp$`-prefixed formula, which is why its Call and
# term labels carry that prefix.
fit2 <- lm(
  salary ~ gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc +
    s_avg + f_avg + quarter + satis + age + frstlang,
  data = mbasp
)
summary(fit2)
##
## Call:
## lm(formula = mbasp$salary ~ mbasp$gmat_tot + mbasp$gmat_qpc +
## mbasp$gmat_vpc + mbasp$gmat_tpc + mbasp$s_avg + mbasp$f_avg +
## mbasp$quarter + mbasp$satis + mbasp$age + mbasp$frstlang,
## data = mbasp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24137 -8244 -490 5313 68756
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 64622.144 49262.731 1.312 0.1929
## mbasp$gmat_tot 8.337 177.818 0.047 0.9627
## mbasp$gmat_qpc 827.849 491.659 1.684 0.0956 .
## mbasp$gmat_vpc 530.807 498.305 1.065 0.2896
## mbasp$gmat_tpc -1436.428 711.446 -2.019 0.0464 *
## mbasp$s_avg -1805.530 8145.604 -0.222 0.8251
## mbasp$f_avg -2741.535 3852.548 -0.712 0.4785
## mbasp$quarter -2647.810 2692.668 -0.983 0.3280
## mbasp$satis -925.938 2140.124 -0.433 0.6663
## mbasp$age 2501.003 559.182 4.473 2.2e-05 ***
## mbasp$frstlang 5156.619 6934.452 0.744 0.4590
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15370 on 92 degrees of freedom
## Multiple R-squared: 0.3328, Adjusted R-squared: 0.2603
## F-statistic: 4.589 on 10 and 92 DF, p-value: 2.778e-05
# Model 5 (fifth model fitted): GMAT scores/percentiles, both grade averages,
# quarter, satisfaction, work experience and first language — age is left out.
# NOTE: this assigns to `fit2` again, replacing the previous fit2 object.
# Predictors are written as bare column names resolved through `data = mbasp`
# (instead of `mbasp$col`), so `data` is actually used and predict() can take
# `newdata`. The estimates are unchanged; the output recorded below was
# produced by the earlier `mbasp$`-prefixed formula, which is why its Call and
# term labels carry that prefix.
fit2 <- lm(
  salary ~ gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc +
    s_avg + f_avg + quarter + satis + work_yrs + frstlang,
  data = mbasp
)
summary(fit2)
##
## Call:
## lm(formula = mbasp$salary ~ mbasp$gmat_tot + mbasp$gmat_qpc +
## mbasp$gmat_vpc + mbasp$gmat_tpc + mbasp$s_avg + mbasp$f_avg +
## mbasp$quarter + mbasp$satis + mbasp$work_yrs + mbasp$frstlang,
## data = mbasp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31824 -7739 -135 4626 69927
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 113245.969 48915.660 2.315 0.022830 *
## mbasp$gmat_tot 5.264 180.698 0.029 0.976821
## mbasp$gmat_qpc 888.079 499.786 1.777 0.078887 .
## mbasp$gmat_vpc 637.786 505.676 1.261 0.210406
## mbasp$gmat_tpc -1562.324 720.886 -2.167 0.032800 *
## mbasp$s_avg -892.329 8257.710 -0.108 0.914183
## mbasp$f_avg -3467.223 3891.435 -0.891 0.375260
## mbasp$quarter -2785.408 2734.537 -1.019 0.311063
## mbasp$satis -600.166 2168.554 -0.277 0.782586
## mbasp$work_yrs 2353.141 578.903 4.065 0.000101 ***
## mbasp$frstlang 11549.611 6630.292 1.742 0.084860 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15610 on 92 degrees of freedom
## Multiple R-squared: 0.3114, Adjusted R-squared: 0.2365
## F-statistic: 4.16 on 10 and 92 DF, p-value: 9.275e-05
Conclusion from regression analysis: Of the models above, model 4 (the fourth model fitted, which includes age and frstlang but not work_yrs) suits the data best. Its adjusted R-squared is 0.2603 and its multiple R-squared is 0.3328. Age is the most statistically significant variable in it.
# Placement indicator on mbas: 1 where salary is 0, 2 where salary is
# non-zero (used below as not placed / placed).
mbas$pnp <- ifelse(mbas$salary == 0, 1, 2)
# The data viewer is only opened in interactive sessions so that the script
# also runs under Rscript / knitr.
if (interactive()) {
  View(mbas)
}
H0: Gender and placement are independent. H1: Gender and placement are dependent.
# Contingency table of sex by the pnp indicator on mbas, shown with row and
# column totals, followed by a chi-squared test on the table.
mbaschi <- xtabs(~ sex + pnp, data = mbas)
addmargins(mbaschi)
## pnp
## sex 1 2 Sum
## 1 67 72 139
## 2 23 31 54
## Sum 90 103 193
# The viewer is only opened in interactive sessions so that the script also
# runs under Rscript / knitr.
if (interactive()) {
  View(mbaschi)
}
chisq.test(mbaschi)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: mbaschi
## X-squared = 0.29208, df = 1, p-value = 0.5889
Inference: Since the p-value > 0.05, we fail to reject the null hypothesis. Hence, there is no evidence that placement depends on gender.
H0: First language and placement are independent. H1: First language and placement are dependent.
# Contingency table of frstlang by the pnp indicator on mbas, shown with row
# and column totals, followed by a chi-squared test on the table.
mbaschi1 <- xtabs(~ frstlang + pnp, data = mbas)
addmargins(mbaschi1)
## pnp
## frstlang 1 2 Sum
## 1 82 96 178
## 2 8 7 15
## Sum 90 103 193
# The viewer is only opened in interactive sessions so that the script also
# runs under Rscript / knitr.
if (interactive()) {
  View(mbaschi1)
}
chisq.test(mbaschi1)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: mbaschi1
## X-squared = 0.074127, df = 1, p-value = 0.7854
Inference: Since the p-value > 0.05, we fail to reject the null hypothesis. Hence, there is no evidence that placement depends on first language.
# cgpa: the mean of the two grade averages (s_avg and f_avg).
mbas$cgpa <- with(mbas, (s_avg + f_avg) / 2)
# impr: change from s_avg to f_avg, expressed as a percentage of s_avg.
mbas$impr <- with(mbas, (f_avg - s_avg) / s_avg * 100)
H0: The sign of the percent change in grade average (impr, from s_avg to f_avg) and placement are independent.
H1: The sign of the percent change in grade average (impr, from s_avg to f_avg) and placement are dependent.
# imprpos flags the sign of impr: 1 when impr is zero or positive, 0 when it
# is negative.
mbas$imprpos <- as.numeric(mbas$impr >= 0)
# Cross-tabulate the flag against the pnp indicator, show the totals, and run
# a chi-squared test on the table.
mbaschi2 <- xtabs(~ imprpos + pnp, data = mbas)
addmargins(mbaschi2)
## pnp
## imprpos 1 2 Sum
## 0 31 46 77
## 1 59 57 116
## Sum 90 103 193
chisq.test(mbaschi2)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: mbaschi2
## X-squared = 1.6861, df = 1, p-value = 0.1941
Inference: Since the p-value > 0.05, we fail to reject the null hypothesis. Hence, there is no evidence that placement depends on whether the grade average improved.
H0: There is no significant difference between the GMAT scores of those placed and those not placed.
H1: There is a significant difference between the GMAT scores of those placed and those not placed.
# Two-sided Welch t-test of total GMAT score for pnp == 1 (salary of 0)
# against pnp == 2 (non-zero salary).
t.test(
  x = mbas$gmat_tot[mbas$pnp == 1],
  y = mbas$gmat_tot[mbas$pnp == 2]
)
##
## Welch Two Sample t-test
##
## data: mbas$gmat_tot[mbas$pnp == 1] and mbas$gmat_tot[mbas$pnp == 2]
## t = -0.20321, df = 170.77, p-value = 0.8392
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -18.06406 14.69189
## sample estimates:
## mean of x mean of y
## 614.3333 616.0194
Inference: Since the p-value (0.8392) > 0.05, we fail to reject H0; there is no significant difference between the GMAT scores of those placed and those not placed.
H0: There is no significant difference between the CGPA of those placed and those not placed.
H1: There is a significant difference between the CGPA of those placed and those not placed.
# Two-sided Welch t-test of cgpa for pnp == 1 (salary of 0) against
# pnp == 2 (non-zero salary).
t.test(
  x = mbas$cgpa[mbas$pnp == 1],
  y = mbas$cgpa[mbas$pnp == 2]
)
##
## Welch Two Sample t-test
##
## data: mbas$cgpa[mbas$pnp == 1] and mbas$cgpa[mbas$pnp == 2]
## t = -0.78225, df = 178.77, p-value = 0.4351
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.15787336 0.06823905
## sample estimates:
## mean of x mean of y
## 3.046833 3.091650
Inference: Since the p-value (0.4351) > 0.05, we fail to reject H0; there is no significant difference between the CGPA of those placed and those not placed.