Reading the data set into R
mba<- read.csv(paste("MBA Starting Salaries Data.csv", sep=""))
View(mba)
attach(mba)
dim(mba)
## [1] 274 13
str(mba)
## 'data.frame': 274 obs. of 13 variables:
## $ age : int 23 24 24 24 24 24 25 25 25 25 ...
## $ sex : int 2 1 1 1 2 1 1 2 1 1 ...
## $ gmat_tot: int 620 610 670 570 710 640 610 650 630 680 ...
## $ gmat_qpc: int 77 90 99 56 93 82 89 88 79 99 ...
## $ gmat_vpc: int 87 71 78 81 98 89 74 89 91 81 ...
## $ gmat_tpc: int 87 87 95 75 98 91 87 92 89 96 ...
## $ s_avg : num 3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
## $ f_avg : num 3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
## $ quarter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ work_yrs: int 2 2 2 1 2 2 2 2 2 2 ...
## $ frstlang: int 1 1 1 1 1 1 1 1 2 1 ...
## $ salary : int 0 0 0 0 999 0 0 0 999 998 ...
## $ satis : int 7 6 6 7 5 6 5 6 4 998 ...
Create summary statistics (e.g. mean, standard deviation, median, mode) for the important variables in the dataset.
summary(mba)
## age sex gmat_tot gmat_qpc
## Min. :22.00 Min. :1.000 Min. :450.0 Min. :28.00
## 1st Qu.:25.00 1st Qu.:1.000 1st Qu.:580.0 1st Qu.:72.00
## Median :27.00 Median :1.000 Median :620.0 Median :83.00
## Mean :27.36 Mean :1.248 Mean :619.5 Mean :80.64
## 3rd Qu.:29.00 3rd Qu.:1.000 3rd Qu.:660.0 3rd Qu.:93.00
## Max. :48.00 Max. :2.000 Max. :790.0 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :16.00 Min. : 0.0 Min. :2.000 Min. :0.000
## 1st Qu.:71.00 1st Qu.:78.0 1st Qu.:2.708 1st Qu.:2.750
## Median :81.00 Median :87.0 Median :3.000 Median :3.000
## Mean :78.32 Mean :84.2 Mean :3.025 Mean :3.062
## 3rd Qu.:91.00 3rd Qu.:94.0 3rd Qu.:3.300 3rd Qu.:3.250
## Max. :99.00 Max. :99.0 Max. :4.000 Max. :4.000
## quarter work_yrs frstlang salary
## Min. :1.000 Min. : 0.000 Min. :1.000 Min. : 0
## 1st Qu.:1.250 1st Qu.: 2.000 1st Qu.:1.000 1st Qu.: 0
## Median :2.000 Median : 3.000 Median :1.000 Median : 999
## Mean :2.478 Mean : 3.872 Mean :1.117 Mean : 39026
## 3rd Qu.:3.000 3rd Qu.: 4.000 3rd Qu.:1.000 3rd Qu.: 97000
## Max. :4.000 Max. :22.000 Max. :2.000 Max. :220000
## satis
## Min. : 1.0
## 1st Qu.: 5.0
## Median : 6.0
## Mean :172.2
## 3rd Qu.: 7.0
## Max. :998.0
library(psych)
describe(mba)
## vars n mean sd median trimmed mad min max
## age 1 274 27.36 3.71 27 26.76 2.97 22 48
## sex 2 274 1.25 0.43 1 1.19 0.00 1 2
## gmat_tot 3 274 619.45 57.54 620 618.86 59.30 450 790
## gmat_qpc 4 274 80.64 14.87 83 82.31 14.83 28 99
## gmat_vpc 5 274 78.32 16.86 81 80.33 14.83 16 99
## gmat_tpc 6 274 84.20 14.02 87 86.12 11.86 0 99
## s_avg 7 274 3.03 0.38 3 3.03 0.44 2 4
## f_avg 8 274 3.06 0.53 3 3.09 0.37 0 4
## quarter 9 274 2.48 1.11 2 2.47 1.48 1 4
## work_yrs 10 274 3.87 3.23 3 3.29 1.48 0 22
## frstlang 11 274 1.12 0.32 1 1.02 0.00 1 2
## salary 12 274 39025.69 50951.56 999 33607.86 1481.12 0 220000
## satis 13 274 172.18 371.61 6 91.50 1.48 1 998
## range skew kurtosis se
## age 26 2.16 6.45 0.22
## sex 1 1.16 -0.66 0.03
## gmat_tot 340 -0.01 0.06 3.48
## gmat_qpc 71 -0.92 0.30 0.90
## gmat_vpc 83 -1.04 0.74 1.02
## gmat_tpc 99 -2.28 9.02 0.85
## s_avg 2 -0.06 -0.38 0.02
## f_avg 4 -2.08 10.85 0.03
## quarter 3 0.02 -1.35 0.07
## work_yrs 22 2.78 9.80 0.20
## frstlang 1 2.37 3.65 0.02
## salary 220000 0.70 -1.05 3078.10
## satis 997 1.77 1.13 22.45
notplaced = mba[which (mba$salary == 0) , ]
placed = mba[which (mba$salary != 0) , ]
notanswered = placed[which (placed$salary == 998) , ]
notdisclosed = placed[which (placed$salary == 999) , ]
a = placed[which (placed$salary != 999) , ]
salarygiven = a[which (a$salary != 998) , ]
View(salarygiven)
boxplot(placed$salary, data=placed , xlab="Salary", main="Boxplot of Salary", horizontal=TRUE,col = "blue")
boxplot(placed$age, data=placed , xlab="Age", main="Boxplot of Age", horizontal=TRUE,col = "yellow")
library(lattice)
histogram(~mba$salary, data = placed,main = "Distribution of Salary", xlab="Salary", col='grey' )
To Draw Scatter Plots to understand how are the variables correlated pair-wise
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
scatterplot(salary ~ work_yrs | sex ,data=placed, main="Scatterplot of Salary with Work Experience", xlab="Work Experience", ylab="Starting Salaries")
scatterplot(salary ~ satis ,data=placed, main="Scatterplot of Salary with Degree of satisfaction", xlab="Degree of satisfaction", ylab="Starting Salaries")
scatterplot(salary ~ gmat_tot ,data=placed, main="Scatterplot of Salary with GMAT Total Score", xlab="GMAT Total Score", ylab="Starting Salaries")
To draw a Corrgram and to Create a Variance-Covariance Matrix
library(corrgram)
corrgram(placed , order=FALSE, lower.panel=panel.shade, upper.panel=panel.pie, text.panel=panel.txt, main="Corrgram of Placed Variables")
scatterplot.matrix(~salary+gmat_tot+s_avg+f_avg+work_yrs, data=placed,
main="Salary versus other variables")
## Warning: 'scatterplot.matrix' is deprecated.
## Use 'scatterplotMatrix' instead.
## See help("Deprecated") and help("car-deprecated").
Draw Draw Contingency Tables, as appropriate
gotplaced = placed[which (placed$salary != 998) , ]
View(gotplaced)
table(gotplaced$sex)
##
## 1 2
## 102 36
table(gotplaced$frstlang)
##
## 1 2
## 122 16
x<-table(gotplaced$sex,gotplaced$frstlang)
x
##
## 1 2
## 1 91 11
## 2 31 5
prop.table(x)*100
##
## 1 2
## 1 65.942029 7.971014
## 2 22.463768 3.623188
table(gotplaced$work_yrs)
##
## 0 1 2 3 4 5 6 7 8 9 10 15 16
## 2 10 44 29 22 9 9 3 4 1 1 2 2
To run chi-square tests, as appropriate
mba$isplaced[mba$salary > 1000] <- 1
View(mba)
s <- xtabs(~mba$isplaced + mba$sex, data = mba)
s
## mba$sex
## mba$isplaced 1 2
## 1 72 31
prop.table(s,2)*100
## mba$sex
## mba$isplaced 1 2
## 1 100 100
Hypothesis : Percentage of Femlaes who got placed is more than percentage of males who got placed.
chisq.test(s)
##
## Chi-squared test for given probabilities
##
## data: s
## X-squared = 16.32, df = 1, p-value = 5.349e-05
To run t-tests, as appropriate
Null Hyphothesis 1 : There is no significant difference between the salary of females who are placed and salaries of males who are placed.
t.test(salarygiven$salary~salarygiven$sex)
##
## Welch Two Sample t-test
##
## data: salarygiven$salary by salarygiven$sex
## t = 1.3628, df = 38.115, p-value = 0.1809
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -3128.55 16021.72
## sample estimates:
## mean in group 1 mean in group 2
## 104970.97 98524.39
Since p-value > 0.05 , Null hyphothesis is valid and there is no significant difference between the salary of females who are placed and salaries of males who are placed.
Null Hyphothesis 2 : There is no significant difference between the people who are placed who have their first language as English and those who have their first language as not english
t.test(salarygiven$salary~salarygiven$frstlang)
##
## Welch Two Sample t-test
##
## data: salarygiven$salary by salarygiven$frstlang
## t = -1.1202, df = 6.0863, p-value = 0.3049
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -59933.62 22202.25
## sample estimates:
## mean in group 1 mean in group 2
## 101748.6 120614.3
Since p-value > 0.05 , Null hyphothesis is valid and there is no significant difference between the people who are placed who have their first language as English and those who have their first language as not english.
Liner REGRESSION Independent Variables: {work_yrs,s_avg,f_avg,gmat_qpc,gmat_vpc,gmattpc,sex,frstlang,satis} Dependent Variable: Salary
f1 <- gotplaced$salary ~ gotplaced$work_yrs + gotplaced$s_avg + gotplaced$f_avg + gotplaced$gmat_qpc + gotplaced$gmat_vpc + gotplaced$gmat_tpc + gotplaced$sex + gotplaced$frstlang + gotplaced$satis
lm1 <- lm(f1, data = gotplaced)
summary(lm1)
##
## Call:
## lm(formula = f1, data = gotplaced)
##
## Residuals:
## Min 1Q Median 3Q Max
## -102066 -21162 7023 24234 129791
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -85125.9 47151.0 -1.805 0.07336 .
## gotplaced$work_yrs 1486.8 1341.7 1.108 0.26986
## gotplaced$s_avg 32113.7 10752.2 2.987 0.00338 **
## gotplaced$f_avg -1726.0 9014.2 -0.191 0.84845
## gotplaced$gmat_qpc -404.5 364.8 -1.109 0.26955
## gotplaced$gmat_vpc -713.1 326.3 -2.185 0.03067 *
## gotplaced$gmat_tpc 594.5 486.8 1.221 0.22422
## gotplaced$sex 6582.5 7832.9 0.840 0.40227
## gotplaced$frstlang -15675.1 12231.1 -1.282 0.20231
## gotplaced$satis 20286.7 3133.3 6.475 1.85e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38810 on 128 degrees of freedom
## Multiple R-squared: 0.3669, Adjusted R-squared: 0.3223
## F-statistic: 8.241 on 9 and 128 DF, p-value: 1.321e-09
if p- value for independant variables is less than 0.05 , then Salary is significantly dependant on that. SO Slary is significantly dependant on s_avg, gmat_vpc and satis.
f2 <- gotplaced$salary ~ gotplaced$work_yrs + gotplaced$s_avg + gotplaced$gmat_vpc + gotplaced$sex + gotplaced$frstlang + gotplaced$satis
lm2 <- lm(f2, data = gotplaced)
summary(lm2)
##
## Call:
## lm(formula = f2, data = gotplaced)
##
## Residuals:
## Min 1Q Median 3Q Max
## -100997 -21894 7832 24235 123531
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -94161.3 40734.6 -2.312 0.02236 *
## gotplaced$work_yrs 1668.6 1233.8 1.352 0.17858
## gotplaced$s_avg 29977.9 9057.6 3.310 0.00121 **
## gotplaced$gmat_vpc -429.5 217.6 -1.973 0.05055 .
## gotplaced$sex 7555.2 7636.5 0.989 0.32431
## gotplaced$frstlang -14968.2 11886.1 -1.259 0.21016
## gotplaced$satis 20801.8 3086.6 6.739 4.58e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38620 on 131 degrees of freedom
## Multiple R-squared: 0.3585, Adjusted R-squared: 0.3291
## F-statistic: 12.2 on 6 and 131 DF, p-value: 7.177e-11
lm2$coefficients
## (Intercept) gotplaced$work_yrs gotplaced$s_avg
## -94161.2871 1668.5710 29977.9326
## gotplaced$gmat_vpc gotplaced$sex gotplaced$frstlang
## -429.5049 7555.2158 -14968.2102
## gotplaced$satis
## 20801.8085
Logistic Regression
lm3 <- glm(mba$isplaced ~.,family=binomial(link='logit'),data=mba)
## Warning: glm.fit: algorithm did not converge
summary(lm3)
##
## Call:
## glm(formula = mba$isplaced ~ ., family = binomial(link = "logit"),
## data = mba)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## 2.409e-06 2.409e-06 2.409e-06 2.409e-06 2.409e-06
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.657e+01 1.238e+06 0 1
## age -6.348e-08 2.645e+04 0 1
## sex -7.085e-07 8.346e+04 0 1
## gmat_tot -5.808e-08 4.129e+03 0 1
## gmat_qpc 1.532e-07 1.163e+04 0 1
## gmat_vpc 1.403e-07 1.166e+04 0 1
## gmat_tpc -4.542e-08 1.688e+04 0 1
## s_avg 1.124e-06 1.902e+05 0 1
## f_avg -5.060e-07 9.006e+04 0 1
## quarter -4.420e-07 6.308e+04 0 1
## work_yrs 1.281e-07 2.628e+04 0 1
## frstlang 9.599e-07 1.712e+05 0 1
## salary -5.076e-11 2.433e+00 0 1
## satis -5.605e-07 4.988e+04 0 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 0.0000e+00 on 102 degrees of freedom
## Residual deviance: 5.9756e-10 on 89 degrees of freedom
## (171 observations deleted due to missingness)
## AIC: 28
##
## Number of Fisher Scoring iterations: 25
lm3$coefficients
## (Intercept) age sex gmat_tot gmat_qpc
## 2.656609e+01 -6.348145e-08 -7.084946e-07 -5.808314e-08 1.532370e-07
## gmat_vpc gmat_tpc s_avg f_avg quarter
## 1.403170e-07 -4.542382e-08 1.123515e-06 -5.059961e-07 -4.419518e-07
## work_yrs frstlang salary satis
## 1.280627e-07 9.598788e-07 -5.076355e-11 -5.605335e-07