Shikhar Kohli - PGP32117
03.10.2017
# Load the placement data set from the project's datasets/ folder.
deansdilemma.df <- read.csv("datasets/DeansDilemmaData.csv")
# Rows for students whose Placement value is "Placed".
placed <- subset(deansdilemma.df, Placement == "Placed")
# Median salary over every row of the data set (not only placed students).
median(deansdilemma.df$Salary)
[1] 240000
# Frequency table of the Placement column across all students.
mytable <- table(Placement = deansdilemma.df$Placement)
print(mytable)
Placement
Not Placed Placed
79 312
# Share of the second Placement category, as a percentage to two decimals.
round(100 * prop.table(mytable)[2], digits = 2)
Placed
79.8
# Keep only the rows whose Placement is "Placed", then take the median salary.
placed <- deansdilemma.df[which(deansdilemma.df$Placement == "Placed"), ]
median(placed$Salary)
[1] 260000
# Median salary of placed students, split by Gender.
placed <- deansdilemma.df[which(deansdilemma.df$Placement == "Placed"), ]
by(data = placed$Salary, INDICES = placed$Gender, FUN = median)
placed$Gender: F
[1] 240000
--------------------------------------------------------
placed$Gender: M
[1] 265000
# Frequency histogram of MBA percentage for placed students.
placed <- deansdilemma.df[which(deansdilemma.df$Placement == "Placed"), ]
hist(
  x = placed$Percent_MBA,
  xlab = "MBA Percentage",
  breaks = 6,  # suggested number of cells
  freq = TRUE  # plot counts rather than densities
)
Question 7 Create a dataframe called 'notplaced', that contains a subset of only those students who were NOT placed after their MBA
# Rows for students whose Placement value is "Not Placed".
notplaced <- subset(deansdilemma.df, Placement == "Not Placed")
Question 8 Generate two histograms side-by-side, visually comparing the MBA performance of Placed and Not Placed students, as shown below.
library(lattice)
# One histogram panel of MBA percentage per Placement category.
# The formula refers to columns by bare name so they are resolved through
# `data`; writing `deansdilemma.df$...` inside the formula bypasses `data`
# and puts the data frame name into the axis label.
histogram(
  ~ Percent_MBA | Placement,
  data = deansdilemma.df,
  xlab = "MBA Percentage"
)
Question 9 Generate two boxplots, one below the other, comparing the distribution of salaries of males and females who were placed, as shown below
# Horizontal boxplots (one below the other) of salary by gender.
# The question asks about students who were placed, so the plot is drawn
# from the placed subset rather than from every student.
placed <- subset(deansdilemma.df, Placement == "Placed")
boxplot(
  Salary ~ Gender,
  data = placed,
  horizontal = TRUE,
  staplewex = 1,  # numeric staple width (the original passed TRUE, i.e. 1)
  names = c("Females", "Males")  # relies on Gender groups ordering as F, M
)
Question 10
Create a dataframe called 'placedET', representing students who were placed after the MBA and who also gave some MBA entrance test before admission into the MBA program
# Placed students whose Entrance_Test value is something other than "None".
placedET <- subset(
  deansdilemma.df,
  Placement == "Placed" & Entrance_Test != "None"
)
Question 11
Draw a Scatter Plot Matrix for 3 variables – {Salary, Percent_MBA, Percentile_ET} using the dataframe placedET, as shown below.
# Scatter plot matrix of Salary, Percent_MBA and Percentile_ET.
placed <- deansdilemma.df[which(deansdilemma.df$Placement == "Placed"), ]
# Presumably S.TEST == 1 flags students who sat an entrance test; confirm
# against the data dictionary. This overwrites any earlier `placedET`.
placedET <- subset(placed, S.TEST == 1)
library(car)
scatterplotMatrix(
  ~ Salary + Percent_MBA + Percentile_ET,
  data = placedET,
  main = "Scatter Plot Matrix via Deans Dilemma data",
  spread = FALSE,
  smoother.args = list(lty = 2)
)
Question 12
What is mean salary of all the students who were placed?
# Mean salary over the rows currently held in `placed`.
mean(placed[["Salary"]])
[1] 274550
Question 13
Test whether the distribution of salaries is normal
# Shapiro-Wilk normality test on the Salary column of the full data set
# (deansdilemma.df, not the placed-only subset).
# NOTE(review): confirm whether the question intends all students or only
# placed students; the overall median (240000) differs from the placed-only
# median (260000), so the two populations are not interchangeable.
shapiro.test(deansdilemma.df$Salary)
Shapiro-Wilk normality test
data: deansdilemma.df$Salary
W = 0.89789, p-value = 1.619e-15
# Normal Q-Q plot of all salaries with the reference line drawn on top.
with(deansdilemma.df, {
  qqnorm(Salary)
  qqline(Salary)
})
Since the p-value (1.619e-15) is far below 0.05, we reject the null hypothesis that salaries are normally distributed.
Question 14 Create a table giving the mean salary of the men and the women MBAs who were placed?
# Mean salary by gender for students who were placed, as the question asks.
# The original call used every student, which gave lower means than the
# placed-only means reported by the t-test later in this file
# (253068.0 for F, 284241.9 for M).
# NOTE(review): the recorded output after this call came from the
# all-students version and must be regenerated.
placed <- subset(deansdilemma.df, Placement == "Placed")
by(placed$Salary, placed$Gender, mean)
deansdilemma.df$Gender: F
[1] 193288.2
--------------------------------------------------------
deansdilemma.df$Gender: M
[1] 231484.8
Question 15 Visualize the mean salary of men and women who were placed, using plotmeans()
# Plot of mean salary by gender for placed students only; the original
# call plotted all students although the question asks about those placed.
placed <- subset(deansdilemma.df, Placement == "Placed")
gplots::plotmeans(Salary ~ Gender, data = placed)
Question 16 Create a table giving the variance of the salary of the men and women MBAs who were placed?
# Variance of salary by gender for placed students only; the original call
# used every student although the question asks about those placed.
# NOTE(review): the recorded output after this call came from the
# all-students version and must be regenerated.
placed <- subset(deansdilemma.df, Placement == "Placed")
by(data = placed$Salary, INDICES = placed$Gender, FUN = var)
deansdilemma.df$Gender: F
[1] 15840147717
--------------------------------------------------------
deansdilemma.df$Gender: M
[1] 20303338173
Question 17 Use a suitable statistical test to compare whether the variances are significantly different?
# F test comparing the salary variances of the two genders among placed
# students.
placed <- deansdilemma.df[which(deansdilemma.df$Placement == "Placed"), ]
res.ftest <- var.test(
  placed$Salary ~ placed$Gender,
  data = placed
)
print(res.ftest)
F test to compare two variances
data: placed$Salary by placed$Gender
F = 0.55675, num df = 96, denom df = 214, p-value = 0.00135
alternative hypothesis: true ratio of variances is not equal to 1
95 percent confidence interval:
0.3999212 0.7927360
sample estimates:
ratio of variances
0.5567478
Question 18 Use a suitable statistical test to compare whether there is a significant difference between the salaries of men and women?
# Two-sample t-test of salary between genders, run on the current `placed`
# data frame.
res.ttest <- t.test(
  placed$Salary ~ placed$Gender,
  data = placed
)
print(res.ttest)
Welch Two Sample t-test
data: placed$Salary by placed$Gender
t = -3.0757, df = 243.03, p-value = 0.00234
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-51138.42 -11209.22
sample estimates:
mean in group F mean in group M
253068.0 284241.9
Create data frames for engineers and non-engineers
# Split the students by whether Course_Degree equals "Engineering".
engineering.df <- subset(deansdilemma.df, Course_Degree == "Engineering")
nonengineering.df <- subset(deansdilemma.df, Course_Degree != "Engineering")
ENGINEERING
Question 14 Create a table giving the mean salary of the men and the women MBAs who were placed?
# Mean salary by gender for engineering students who were placed.
# The original call used all engineering students; the placed-only means
# reported by the t-test later in this section are 267142.9 (F) and
# 342869.6 (M).
# NOTE(review): the recorded output after this call came from the
# unfiltered version and must be regenerated.
placed <- subset(engineering.df, Placement == "Placed")
by(placed$Salary, placed$Gender, mean)
engineering.df$Gender: F
[1] 187000
--------------------------------------------------------
engineering.df$Gender: M
[1] 292074.1
Question 15 Visualize the mean salary of men and women who were placed, using plotmeans()
# Plot of mean salary by gender for engineering students who were placed;
# the original call plotted all engineering students.
placed <- subset(engineering.df, Placement == "Placed")
gplots::plotmeans(Salary ~ Gender, data = placed)
Question 16 Create a table giving the variance of the salary of the men and women MBAs who were placed?
# Variance of salary by gender for engineering students who were placed;
# the original call used all engineering students.
# NOTE(review): the recorded output after this call came from the
# unfiltered version and must be regenerated.
placed <- subset(engineering.df, Placement == "Placed")
by(data = placed$Salary, INDICES = placed$Gender, FUN = var)
engineering.df$Gender: F
[1] 19312222222
--------------------------------------------------------
engineering.df$Gender: M
[1] 27796148148
Question 17 Use a suitable statistical test to compare whether the variances are significantly different?
# F test comparing salary variances between genders among placed
# engineering students.
placed <- engineering.df[which(engineering.df$Placement == "Placed"), ]
res.ftest <- var.test(
  placed$Salary ~ placed$Gender,
  data = placed
)
print(res.ftest)
F test to compare two variances
data: placed$Salary by placed$Gender
F = 0.27253, num df = 6, denom df = 22, p-value = 0.1122
alternative hypothesis: true ratio of variances is not equal to 1
95 percent confidence interval:
0.08921974 1.40097801
sample estimates:
ratio of variances
0.2725341
Question 18 Use a suitable statistical test to compare whether there is a significant difference between the salaries of men and women?
# Two-sample t-test of salary between genders on the current `placed`
# data frame (placed engineering students at this point in the script).
res.ttest <- t.test(
  placed$Salary ~ placed$Gender,
  data = placed
)
print(res.ttest)
Welch Two Sample t-test
data: placed$Salary by placed$Gender
t = -2.18, df = 20.06, p-value = 0.04134
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-148173.503 -3279.913
sample estimates:
mean in group F mean in group M
267142.9 342869.6
NON ENGINEERING
Question 14 Create a table giving the mean salary of the men and the women MBAs who were placed?
# Mean salary by gender for non-engineering students who were placed.
# The original call used all non-engineering students; the placed-only
# means reported by the t-test later in this section are 251973.3 (F) and
# 277218.8 (M).
# NOTE(review): the recorded output after this call came from the
# unfiltered version and must be regenerated.
placed <- subset(nonengineering.df, Placement == "Placed")
by(placed$Salary, placed$Gender, mean)
nonengineering.df$Gender: F
[1] 193825.6
--------------------------------------------------------
nonengineering.df$Gender: M
[1] 224582.3
Question 15 Visualize the mean salary of men and women who were placed, using plotmeans()
# Plot of mean salary by gender for non-engineering students who were
# placed; the original call plotted all non-engineering students.
placed <- subset(nonengineering.df, Placement == "Placed")
gplots::plotmeans(Salary ~ Gender, data = placed)
Question 16 Create a table giving the variance of the salary of the men and women MBAs who were placed?
# Variance of salary by gender for non-engineering students who were
# placed; the original call used all non-engineering students.
# NOTE(review): the recorded output after this call came from the
# unfiltered version and must be regenerated.
placed <- subset(nonengineering.df, Placement == "Placed")
by(data = placed$Salary, INDICES = placed$Gender, FUN = var)
nonengineering.df$Gender: F
[1] 15703615544
--------------------------------------------------------
nonengineering.df$Gender: M
[1] 19096049346
Question 17 Use a suitable statistical test to compare whether the variances are significantly different?
# F test comparing salary variances between genders among placed
# non-engineering students.
placed <- nonengineering.df[which(nonengineering.df$Placement == "Placed"), ]
res.ftest <- var.test(
  placed$Salary ~ placed$Gender,
  data = placed
)
print(res.ftest)
F test to compare two variances
data: placed$Salary by placed$Gender
F = 0.63307, num df = 89, denom df = 191, p-value = 0.01571
alternative hypothesis: true ratio of variances is not equal to 1
95 percent confidence interval:
0.4479942 0.9159224
sample estimates:
ratio of variances
0.6330683
Question 18 Use a suitable statistical test to compare whether there is a significant difference between the salaries of men and women?
# Two-sample t-test of salary between genders on the current `placed`
# data frame (placed non-engineering students at this point in the script).
res.ttest <- t.test(
  placed$Salary ~ placed$Gender,
  data = placed
)
print(res.ttest)
Welch Two Sample t-test
data: placed$Salary by placed$Gender
t = -2.4149, df = 214.74, p-value = 0.01658
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-45851.09 -4639.74
sample estimates:
mean in group F mean in group M
251973.3 277218.8