##Task 2
library(readxl)
# Import the Excel workbook into `Business_School`.
Business_School <- read_excel(
  path = "~/Desktop/R Take Home Exam 2025/Business School.xlsx"
)
# View(Business_School)
Reading the data set and converting it to a data frame.
# install.packages("readxl")
library(readxl)
# Read the workbook and coerce the result to a base data frame in one step.
mydata <- as.data.frame(
  read_xlsx(path = "~/Desktop/R Take Home Exam 2025/Business School.xlsx")
)
# Preview the first rows to check that the import worked.
head(mydata)
## Student ID Undergrad Degree Undergrad Grade MBA Grade Work Experience
## 1 1 Business 68.4 90.2 No
## 2 2 Computer Science 70.2 68.7 Yes
## 3 3 Finance 76.4 83.3 No
## 4 4 Business 82.6 88.7 No
## 5 5 Finance 76.9 75.4 No
## 6 6 Computer Science 83.3 82.1 No
## Employability (Before) Employability (After) Status Annual Salary
## 1 252 276 Placed 111000
## 2 101 119 Placed 107000
## 3 401 462 Placed 109000
## 4 287 342 Placed 148000
## 5 275 347 Placed 255500
## 6 254 313 Placed 103500
library(ggplot2)
# Bar chart of the number of rows per undergraduate degree, with the count
# printed just above each bar.
ggplot(mydata, aes(x = `Undergrad Degree`)) +
  geom_bar(colour = "deeppink", fill = "rosybrown") +
  ylab("Frequency") +
  theme_minimal() +
  # after_stat(count) replaces the `..count..` notation, which was deprecated
  # in ggplot2 3.4.0; it refers to the count computed by stat = "count".
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    vjust = -0.3
  )
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Business is the most common undergraduate degree (35 of the 100 students).
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
# Descriptive statistics of annual salary, split by undergraduate degree.
describeBy(
  mydata[["Annual Salary"]],
  group = mydata[["Undergrad Degree"]]
)
##
## Descriptive statistics by group
## group: Art
## vars n mean sd median trimmed mad min max range skew
## X1 1 6 101916.7 19192.23 108250 101916.7 15567.3 75500 124000 48500 -0.3
## kurtosis se
## X1 -1.88 7835.19
## ------------------------------------------------------------
## group: Business
## vars n mean sd median trimmed mad min max range skew
## X1 1 35 111967.1 53060.41 103500 107201.7 31134.6 20000 340000 320000 2.14
## kurtosis se
## X1 7.38 8968.85
## ------------------------------------------------------------
## group: Computer Science
## vars n mean sd median trimmed mad min max range skew
## X1 1 25 102212.4 25046.44 96500 100930 13343.4 56000 168000 112000 0.8
## kurtosis se
## X1 0.58 5009.29
## ------------------------------------------------------------
## group: Engineering
## vars n mean sd median trimmed mad min max range skew
## X1 1 9 125055.6 36252.97 114500 125055.6 16308.6 84000 205500 121500 1.06
## kurtosis se
## X1 -0.02 12084.32
## ------------------------------------------------------------
## group: Finance
## vars n mean sd median trimmed mad min max range skew
## X1 1 25 107785.6 42456.8 102150 103344.8 23943.99 38000 255500 217500 1.62
## kurtosis se
## X1 3.84 8491.36
library(ggplot2)
# Histogram of annual salary using bins that are 10,000 wide.
ggplot(data = mydata, mapping = aes(x = `Annual Salary`)) +
  geom_histogram(
    binwidth = 10000,
    colour = "black",
    fill = "darkolivegreen1"
  ) +
  # Format the x-axis tick labels with comma separators.
  scale_x_continuous(labels = scales::comma) +
  labs(
    x = "Annual Salary",
    y = "Frequency",
    title = "Distribution of Annual Salary"
  )
The graph is asymmetrical and skewed to the right, with most individuals
earning approximately between $50,000 and $150,000. The distribution peaks
around $100,000, indicating that this is the most common salary
range.
# Sample mean of the MBA grades.
mean(mydata[["MBA Grade"]])
## [1] 76.04055
# Sample standard deviation of the MBA grades.
sd(mydata[["MBA Grade"]])
## [1] 7.675114
library(ggplot2)
# Reference curve for the one-sample t-test on MBA grades: the density of
# Student's t distribution with 99 degrees of freedom, drawn over [-4, 4].
# The x axis is the t statistic and the y axis is its density (not raw MBA
# grades), so the labels state exactly that.
ggplot(NULL, aes(c(-4, 4))) +
  geom_line(stat = "function", fun = dt, args = list(df = 99)) +
  xlab("t statistic") +
  ylab("Density") +
  labs(title = "t distribution (df = 99) used to test the mean MBA grade")
# Critical values of the t distribution with 99 df that leave 2.5% in each
# tail: the upper cut-off first, then the lower one.
qt(0.025, df = 99, lower.tail = FALSE)
## [1] 1.984217
qt(0.025, df = 99, lower.tail = TRUE)
## [1] -1.984217
# One-sample, two-sided t-test of H0: mean MBA grade equals 74.
t.test(
  mydata$`MBA Grade`,
  alternative = "two.sided",
  mu = 74
)
##
## One Sample t-test
##
## data: mydata$`MBA Grade`
## t = 2.6587, df = 99, p-value = 0.00915
## alternative hypothesis: true mean is not equal to 74
## 95 percent confidence interval:
## 74.51764 77.56346
## sample estimates:
## mean of x
## 76.04055
Based on the sample data, we reject H0 (p = 0.009, i.e. p < 0.01). We conclude that the average MBA grade is significantly higher than the hypothesised mean of 74, i.e. it has increased.
Cohen's d statistic
#install.packages("effectsize")
library(effectsize)
##
## Attaching package: 'effectsize'
## The following object is masked from 'package:psych':
##
## phi
# Effect size (Cohen's d) of the MBA grades relative to the reference value 74.
effectsize::cohens_d(x = mydata$`MBA Grade`, mu = 74)
## Cohen's d | 95% CI
## ------------------------
## 0.27 | [0.07, 0.46]
##
## - Deviation from a difference of 74.
# Verbal label for d = 0.27 under the Sawilowsky (2009) rules of thumb.
effectsize::interpret_cohens_d(d = 0.27, rules = "sawilowsky2009")
## [1] "small"
## (Rules: sawilowsky2009)