This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.
Task 1 #Data import
# Load readxl and read the homework workbook into `mydata`.
library(readxl)
# NOTE(review): absolute, user-specific path - this only resolves on a machine
# with this exact Desktop folder layout; consider a project-relative path.
mydata <- read_excel("~/Desktop/IMB/R Take Home Exam 2024/HW.xlsx")
# Preview the first rows. The printed output shows that row 1 is NA in every
# column (presumably a blank row in the sheet - TODO confirm and drop it).
head(mydata)
## # A tibble: 6 × 5
## Name Age Height Weight Income
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 <NA> NA NA NA NA
## 2 Alice 25 170 70 40000
## 3 Bob 32 165 80 52000
## 4 Charlie 28 180 75 61000
## 5 Diana 45 160 90 72000
## 6 Edward 35 175 85 45000
#Explanation This dataset consists of 20 individuals. For each individual we have a name and 4 numeric characteristics: Age (in years), Height (in cm), Weight (in kg) and Income.
Data manipulations
# Keep a copy that retains the original column names
# (Name, Age, Height, Weight, Income).
mydata2 <- mydata
# Rename the columns of `mydata` itself; the copy `mydata2` is not affected.
colnames(mydata) <- c("ID", "years", "cm", "kg", "money")
# This prints the copy, so the ORIGINAL names are shown, not the new ones.
# NOTE(review): if the goal was to display the renamed data, use head(mydata).
head(mydata2)
## # A tibble: 6 × 5
## Name Age Height Weight Income
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 <NA> NA NA NA NA
## 2 Alice 25 170 70 40000
## 3 Bob 32 165 80 52000
## 4 Charlie 28 180 75 61000
## 5 Diana 45 160 90 72000
## 6 Edward 35 175 85 45000
# Build `mydata3` from the renamed data without the income column ("money",
# the fifth column), selecting the columns to keep by name.
mydata3 <- mydata[, setdiff(colnames(mydata), "money")]
head(mydata3)
## # A tibble: 6 × 4
## ID years cm kg
## <chr> <dbl> <dbl> <dbl>
## 1 <NA> NA NA NA
## 2 Alice 25 170 70
## 3 Bob 32 165 80
## 4 Charlie 28 180 75
## 5 Diana 45 160 90
## 6 Edward 35 175 85
Descriptive statistics
# Mean income; na.rm = TRUE skips the all-NA first row shown in head(mydata2).
mean(mydata2$Income, na.rm = TRUE)
## [1] 57050
This number shows that the average income of the people in my dataset is equal to $57050.
# Median income, with missing values removed before the calculation.
median(mydata2$Income, na.rm = TRUE)
## [1] 56500
Half of the people have an income lower than $56500, while the other half have an income higher than $56500.
# Sample standard deviation of income, with missing values removed.
sd(mydata2$Income, na.rm = TRUE)
## [1] 12437.1
Graphs
# Scatter plot of income (y axis) against age (x axis), one point per row of
# the renamed data set `mydata`.
library(ggplot2)
ggplot(data = mydata, mapping = aes(x = years, y = money)) +
  geom_point(size = 3) +
  ggtitle("Income vs. Age") +
  xlab("Age") +
  ylab("Income")
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
#Explanation In this graph, we can see the relationship between income and age. Plotting the two variables against each other lets us connect them and draw conclusions. We can see that older people tend to have a higher income, which can be a consequence of different factors.
Task 2
# ggplot2 and readxl are loaded again here; this is harmless if they are
# already attached from the earlier task.
library(ggplot2)
library(readxl)
# Read sheet 1 of the Business School workbook into `MBAdata`.
# NOTE(review): absolute, user-specific path - not portable to other machines.
MBAdata <- read_excel("~/Desktop/IMB/R Take Home Exam 2024/Task 2/Business School.xlsx", sheet = 1)
# Preview the first rows and the column types.
head(MBAdata)
## # A tibble: 6 × 9
## `Student ID` `Undergrad Degree` `Undergrad Grade` `MBA Grade`
## <dbl> <chr> <dbl> <dbl>
## 1 1 Business 68.4 90.2
## 2 2 Computer Science 70.2 68.7
## 3 3 Finance 76.4 83.3
## 4 4 Business 82.6 88.7
## 5 5 Finance 76.9 75.4
## 6 6 Computer Science 83.3 82.1
## # ℹ 5 more variables: `Work Experience` <chr>, `Employability (Before)` <dbl>,
## # `Employability (After)` <dbl>, Status <chr>, `Annual Salary` <dbl>
Graph - distribution of undergraduate degrees
# Bar chart counting how many students hold each undergraduate degree.
library(ggplot2)
ggplot(data = MBAdata, mapping = aes(x = `Undergrad Degree`)) +
  geom_bar(colour = "black", fill = "pink") +
  ggtitle("Distribution of Undergraduate Degrees") +
  xlab("Undergraduate Degree") +
  ylab("Count")
#Explanation The most common degree is Business.
Descriptive statistics
# Minimum, quartiles, mean and maximum of the annual salary.
summary(MBAdata$`Annual Salary`)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 20000 87125 103500 109058 124000 340000
Histogram
# Histogram of annual salary in bins 5000 wide; the x axis labels use
# comma-separated thousands for readability.
ggplot(data = MBAdata, mapping = aes(x = `Annual Salary`)) +
  geom_histogram(binwidth = 5000, colour = "black", fill = "pink") +
  scale_x_continuous(labels = scales::comma) +
  ggtitle("Distribution of Annual Salary") +
  xlab("Annual Salary") +
  ylab("Frequency") +
  theme_minimal()
#Explanation Distribution is skewed to the right.
Testing the hypothesis
# One-sample t-test on the MBA grade.
# H0: the population mean MBA grade equals 74 (mu = 74);
# H1: it differs from 74 in either direction (alternative = "two.sided").
t.test(MBAdata$`MBA Grade`,
mu = 74,
alternative = "two.sided")
##
## One Sample t-test
##
## data: MBAdata$`MBA Grade`
## t = 2.6587, df = 99, p-value = 0.00915
## alternative hypothesis: true mean is not equal to 74
## 95 percent confidence interval:
## 74.51764 77.56346
## sample estimates:
## mean of x
## 76.04055
#Explanation Since the p-value (0.00915) is lower than 5%, we can reject H0 and conclude that the mean MBA grade is not 74; the estimate of the mean is 76.04055.
# Effect size (Cohen's d) for the deviation of the mean MBA grade from the
# hypothesised value 74, using the effectsize package.
library(effectsize)
cohens_d(MBAdata$`MBA Grade`, mu=74)
## Cohen's d | 95% CI
## ------------------------
## 0.27 | [0.07, 0.46]
##
## - Deviation from a difference of 74.
# Classify the effect size with the Sawilowsky (2009) rules. The value is taken
# from cohens_d() directly (its `Cohens_d` column) instead of a hand-copied,
# rounded 0.27, so the label cannot drift out of sync if the data change.
interpret_cohens_d(cohens_d(MBAdata$`MBA Grade`, mu = 74)$Cohens_d,
                   rules = "sawilowsky2009")
## [1] "small"
## (Rules: sawilowsky2009)
#Explanation Based on the results, we can see that the effect is small.
Task 3
# Read the apartments workbook into `Appdata` (read_excel comes from readxl,
# loaded earlier in the document).
# NOTE(review): absolute, user-specific path - not portable to other machines.
Appdata <- read_excel("~/Desktop/IMB/R Take Home Exam 2024/Task 3/Apartments.xlsx")
# Preview the first rows and the column types.
head(Appdata)
## # A tibble: 6 × 5
## Age Distance Price Parking Balcony
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7 28 1640 0 1
## 2 18 1 2800 1 0
## 3 7 28 1660 0 0
## 4 28 29 1850 0 1
## 5 18 18 1640 1 1
## 6 28 12 1770 0 1
Description:
Changing variables into factors
# Recode the 0/1 parking indicator as a labelled factor (0 = "NO", 1 = "YES")
# stored in a new column, leaving the numeric column untouched.
Appdata[["ParkingFactor"]] <- factor(
  Appdata[["Parking"]],
  levels = c(0, 1),
  labels = c("NO", "YES")
)
head(Appdata)
## # A tibble: 6 × 6
## Age Distance Price Parking Balcony ParkingFactor
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 7 28 1640 0 1 NO
## 2 18 1 2800 1 0 YES
## 3 7 28 1660 0 0 NO
## 4 28 29 1850 0 1 NO
## 5 18 18 1640 1 1 YES
## 6 28 12 1770 0 1 NO
# Recode the 0/1 balcony indicator as a labelled factor (0 = "NO", 1 = "YES")
# stored in a new column, leaving the numeric column untouched.
Appdata[["BalconyFactor"]] <- factor(
  Appdata[["Balcony"]],
  levels = c(0, 1),
  labels = c("NO", "YES")
)
head(Appdata)
## # A tibble: 6 × 7
## Age Distance Price Parking Balcony ParkingFactor BalconyFactor
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <fct>
## 1 7 28 1640 0 1 NO YES
## 2 18 1 2800 1 0 YES NO
## 3 7 28 1660 0 0 NO NO
## 4 28 29 1850 0 1 NO YES
## 5 18 18 1640 1 1 YES YES
## 6 28 12 1770 0 1 NO YES
Test hypothesis
# One-sample t-test on the apartment price.
# H0: the population mean price equals 1900 (mu = 1900);
# H1: it differs from 1900 in either direction (alternative = "two.sided").
t.test(Appdata$Price,
mu = 1900,
alternative = "two.sided")
##
## One Sample t-test
##
## data: Appdata$Price
## t = 2.9022, df = 84, p-value = 0.004731
## alternative hypothesis: true mean is not equal to 1900
## 95 percent confidence interval:
## 1937.443 2100.440
## sample estimates:
## mean of x
## 2018.941
#Explanation Since the p-value (0.004731) is lower than 5%, we can reject H0 that the mean price is equal to 1900. The estimated mean is 2018.941.
Simple regression
# Simple linear regression of Price on Age, followed by the model summary.
# NOTE(review): the object is named `fitl` (lowercase L) while the next model
# is `fit2` - presumably `fit1` was intended; confirm before renaming.
fitl <- lm(Price ~ Age,
data = Appdata)
summary(fitl)
##
## Call:
## lm(formula = Price ~ Age, data = Appdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -623.9 -278.0 -69.8 243.5 776.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2185.455 87.043 25.108 <2e-16 ***
## Age -8.975 4.164 -2.156 0.034 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 369.9 on 83 degrees of freedom
## Multiple R-squared: 0.05302, Adjusted R-squared: 0.04161
## F-statistic: 4.647 on 1 and 83 DF, p-value: 0.03401
#Explanation The intercept is 2185.455. The slope of -8.975 shows that if the age of an apartment increases by 1 year, the price will on average decrease by 8.975. Coefficient of determination: 5.302% of the variability in Price is explained by the linear effect of Age. Coefficient of correlation: the square root of the coefficient of determination, taken with the sign of the slope, which equals -0.2303.
Scatterplot matrix
library(car)
## Loading required package: carData
# Scatterplot matrix of the three numeric variables, without smoothers.
# Columns are selected by name rather than by dropping positions 4-7, so the
# call does not depend on the column order or on how many helper columns
# (such as the factor versions of Parking and Balcony) have been added.
scatterplotMatrix(Appdata[, c("Age", "Distance", "Price")],
                  smooth = FALSE)
#Explanation Judging from the scatterplot matrix, the explanatory variables Age and Distance are not strongly related to each other, so there is no sign of
high multicollinearity.
Fit2 - Price = f(Age, Distance)
# Multiple linear regression of Price on Age and Distance, followed by the
# model summary.
fit2 <- lm(Price ~ Age + Distance,
data = Appdata)
summary(fit2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = Appdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -603.23 -219.94 -85.68 211.31 689.58
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2460.101 76.632 32.10 < 2e-16 ***
## Age -7.934 3.225 -2.46 0.016 *
## Distance -20.667 2.748 -7.52 6.18e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 286.3 on 82 degrees of freedom
## Multiple R-squared: 0.4396, Adjusted R-squared: 0.4259
## F-statistic: 32.16 on 2 and 82 DF, p-value: 4.896e-11
VIF
# Variance inflation factor for each explanatory variable in fit2
# (presumably car::vif, since car is loaded earlier in the document).
vif(fit2)
## Age Distance
## 1.001845 1.001845
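#Explanation As both VIF values are almost 1, we can conclude that the explanatory variables Age and Distance are almost completely uncorrelated with each other, so multicollinearity is not a problem in this model.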