R Markdown

Eseniya Ganina

## Task 2

library(readxl)
# Import the Business School workbook from the exam folder.
Business_School <- read_excel(
  path = "~/Desktop/R Take Home Exam 2025/Business School.xlsx"
)
# View(Business_School)  # optional interactive inspection, left disabled

Reading the data set and converting it to a data frame

#install.packages("readxl")
library(readxl)
# Read the workbook and convert the result straight to a base data frame.
mydata <- as.data.frame(
  read_xlsx("~/Desktop/R Take Home Exam 2025/Business School.xlsx")
)
# Preview the first six rows.
head(mydata, n = 6L)
##   Student ID Undergrad Degree Undergrad Grade MBA Grade Work Experience
## 1          1         Business            68.4      90.2              No
## 2          2 Computer Science            70.2      68.7             Yes
## 3          3          Finance            76.4      83.3              No
## 4          4         Business            82.6      88.7              No
## 5          5          Finance            76.9      75.4              No
## 6          6 Computer Science            83.3      82.1              No
##   Employability (Before) Employability (After) Status Annual Salary
## 1                    252                   276 Placed        111000
## 2                    101                   119 Placed        107000
## 3                    401                   462 Placed        109000
## 4                    287                   342 Placed        148000
## 5                    275                   347 Placed        255500
## 6                    254                   313 Placed        103500
  1. Distribution of variables
library(ggplot2)
# Bar chart of how many students hold each undergraduate degree,
# with the count printed above every bar.
ggplot(mydata, aes(x = `Undergrad Degree`)) +
  geom_bar(colour = "deeppink", fill = "rosybrown") +
  ylab("Frequency") +
  theme_minimal() +
  geom_text(
    stat = "count",
    # after_stat() replaces the `..count..` notation deprecated in ggplot2 3.4.0
    aes(label = after_stat(count)),
    vjust = -0.3 # nudge the labels just above the bar tops
  )
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Business is the most common undergraduate degree (35 of the 100 students).

  1. Descriptive statistics of the Annual salary and its distribution
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
# Descriptive statistics of annual salary, split by undergraduate degree.
describeBy(
  x = mydata[["Annual Salary"]],
  group = mydata[["Undergrad Degree"]]
)
## 
##  Descriptive statistics by group 
## group: Art
##    vars n     mean       sd median  trimmed     mad   min    max range skew
## X1    1 6 101916.7 19192.23 108250 101916.7 15567.3 75500 124000 48500 -0.3
##    kurtosis      se
## X1    -1.88 7835.19
## ------------------------------------------------------------ 
## group: Business
##    vars  n     mean       sd median  trimmed     mad   min    max  range skew
## X1    1 35 111967.1 53060.41 103500 107201.7 31134.6 20000 340000 320000 2.14
##    kurtosis      se
## X1     7.38 8968.85
## ------------------------------------------------------------ 
## group: Computer Science
##    vars  n     mean       sd median trimmed     mad   min    max  range skew
## X1    1 25 102212.4 25046.44  96500  100930 13343.4 56000 168000 112000  0.8
##    kurtosis      se
## X1     0.58 5009.29
## ------------------------------------------------------------ 
## group: Engineering
##    vars n     mean       sd median  trimmed     mad   min    max  range skew
## X1    1 9 125055.6 36252.97 114500 125055.6 16308.6 84000 205500 121500 1.06
##    kurtosis       se
## X1    -0.02 12084.32
## ------------------------------------------------------------ 
## group: Finance
##    vars  n     mean      sd median  trimmed      mad   min    max  range skew
## X1    1 25 107785.6 42456.8 102150 103344.8 23943.99 38000 255500 217500 1.62
##    kurtosis      se
## X1     3.84 8491.36
library(ggplot2)

# Histogram of annual salaries using bins that are 10,000 wide.
ggplot(data = mydata, mapping = aes(x = `Annual Salary`)) +
  geom_histogram(
    colour = "black",
    fill = "darkolivegreen1",
    binwidth = 10000
  ) +
  # print the x-axis tick labels with thousands separators
  scale_x_continuous(labels = scales::comma) +
  ggtitle("Distribution of Annual Salary") +
  xlab("Annual Salary") +
  ylab("Frequency")

The histogram is asymmetrical and skewed to the right, with most individuals earning approximately between $50,000 and $150,000. The distribution peaks around $100,000, indicating that this is the most common salary range.

  1. Hypothesis testing: $H_0: \mu_{\text{MBA Grade}} = 74$ versus $H_1: \mu_{\text{MBA Grade}} \neq 74$.
# Sample mean and standard deviation of the MBA grades.
mean(mydata[["MBA Grade"]])
## [1] 76.04055
sd(mydata[["MBA Grade"]])
## [1] 7.675114
library(ggplot2)
# Density curve of the t distribution with 99 degrees of freedom, drawn
# over [-4, 4]. The curve comes from dt(), not from the observed MBA grades,
# so the axes and title describe the t distribution itself.
ggplot(NULL, aes(c(-4, 4))) +
  geom_line(stat = "function", fun = dt, args = list(df = 99)) +
  ylab("Density") +
  xlab("t statistic") +
  labs(title = "t distribution with 99 degrees of freedom")

# Upper and lower critical values of the t distribution (df = 99) that
# leave 2.5% in each tail.
qt(0.025, 99, lower.tail = FALSE)
## [1] 1.984217
qt(0.025, 99, lower.tail = TRUE)
## [1] -1.984217
# Two-sided one-sample t-test of H0: mean MBA grade = 74.
t.test(
  x = mydata$`MBA Grade`,
  alternative = "two.sided",
  mu = 74
)
## 
##  One Sample t-test
## 
## data:  mydata$`MBA Grade`
## t = 2.6587, df = 99, p-value = 0.00915
## alternative hypothesis: true mean is not equal to 74
## 95 percent confidence interval:
##  74.51764 77.56346
## sample estimates:
## mean of x 
##  76.04055

Based on the sample data, we reject H0 (t = 2.66, df = 99, p = 0.009, i.e. p < 0.01). The average MBA grade differs from 74; the sample mean of 76.04 indicates that it has increased.

Cohen's d statistic

#install.packages("effectsize")
library(effectsize)
## 
## Attaching package: 'effectsize'
## The following object is masked from 'package:psych':
## 
##     phi
# Effect size (Cohen's d) of the mean MBA grade relative to mu = 74.
# The estimate is stored so the interpretation below uses the computed value
# instead of a hand-copied, rounded number.
mba_d <- effectsize::cohens_d(mydata$`MBA Grade`, mu = 74)
mba_d
## Cohen's d |       95% CI
## ------------------------
## 0.27      | [0.07, 0.46]
## 
## - Deviation from a difference of 74.
# Label the magnitude using the Sawilowsky (2009) rule set.
effectsize::interpret_cohens_d(mba_d$Cohens_d, rules = "sawilowsky2009")
## [1] "small"
## (Rules: sawilowsky2009)