R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

Task 1 #Data import

# Attach readxl, import the homework workbook and preview its first rows.
library("readxl")
mydata <- read_excel(path = "~/Desktop/IMB/R Take Home Exam 2024/HW.xlsx")
head(x = mydata)
## # A tibble: 6 × 5
##   Name      Age Height Weight Income
##   <chr>   <dbl>  <dbl>  <dbl>  <dbl>
## 1 <NA>       NA     NA     NA     NA
## 2 Alice      25    170     70  40000
## 3 Bob        32    165     80  52000
## 4 Charlie    28    180     75  61000
## 5 Diana      45    160     90  72000
## 6 Edward     35    175     85  45000

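#Explanation This dataset consists of 20 individuals, each with 4 characteristics: Age (in years), Height (in cm), Weight (in kg) and Income. The Name column identifies the individual.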

Data manipulations

# Keep a copy of the imported data under its original column names.
mydata2 <- mydata
# Rename the columns of mydata itself (not of the copy mydata2).
colnames(mydata) <- c("ID", "years", "cm", "kg", "money")
# NOTE(review): this previews mydata2, which still carries the original
# names; head(mydata) would be needed to show the renamed columns.
# Confirm which object was meant to be renamed and previewed.
head(mydata2)
## # A tibble: 6 × 5
##   Name      Age Height Weight Income
##   <chr>   <dbl>  <dbl>  <dbl>  <dbl>
## 1 <NA>       NA     NA     NA     NA
## 2 Alice      25    170     70  40000
## 3 Bob        32    165     80  52000
## 4 Charlie    28    180     75  61000
## 5 Diana      45    160     90  72000
## 6 Edward     35    175     85  45000
# Drop the fifth column (money) and preview the remaining four.
mydata3 <- mydata[, -5L]
head(x = mydata3)
## # A tibble: 6 × 4
##   ID      years    cm    kg
##   <chr>   <dbl> <dbl> <dbl>
## 1 <NA>       NA    NA    NA
## 2 Alice      25   170    70
## 3 Bob        32   165    80
## 4 Charlie    28   180    75
## 5 Diana      45   160    90
## 6 Edward     35   175    85

Descriptive statistics

# Arithmetic mean of income, ignoring missing values.
mean(x = mydata2[["Income"]], na.rm = TRUE)
## [1] 57050

This number shows that the average income of the people in my dataset is equal to 57050$.

# Median income, ignoring missing values.
median(x = mydata2[["Income"]], na.rm = TRUE)
## [1] 56500

Half of the people have an income lower than 56500$, while the other half have an income higher than 56500$.

# Sample standard deviation of income, ignoring missing values.
sd(x = mydata2[["Income"]], na.rm = TRUE)
## [1] 12437.1

Graphs

library("ggplot2")

# Scatter plot of income (money) against age (years), one point per row.
ggplot(data = mydata, mapping = aes(x = years, y = money)) +
  geom_point(size = 3) +
  ggtitle("Income vs. Age") +
  xlab("Age") +
  ylab("Income")
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

#Explanation In this graph, we can see the relationship between income and age. This is important in order to connect these two variables and draw conclusions. We can see that older people tend to have higher income, which can be a consequence of different factors.

Task 2

library("ggplot2")
library("readxl")

# Import the first sheet of the business school workbook and preview it.
MBAdata <- read_excel(
  path = "~/Desktop/IMB/R Take Home Exam 2024/Task 2/Business School.xlsx",
  sheet = 1
)
head(x = MBAdata)
## # A tibble: 6 × 9
##   `Student ID` `Undergrad Degree` `Undergrad Grade` `MBA Grade`
##          <dbl> <chr>                          <dbl>       <dbl>
## 1            1 Business                        68.4        90.2
## 2            2 Computer Science                70.2        68.7
## 3            3 Finance                         76.4        83.3
## 4            4 Business                        82.6        88.7
## 5            5 Finance                         76.9        75.4
## 6            6 Computer Science                83.3        82.1
## # ℹ 5 more variables: `Work Experience` <chr>, `Employability (Before)` <dbl>,
## #   `Employability (After)` <dbl>, Status <chr>, `Annual Salary` <dbl>

Graph - distribution of undergraduate degrees

library("ggplot2")

# Bar chart counting students per undergraduate degree.
ggplot(data = MBAdata, mapping = aes(x = `Undergrad Degree`)) +
  geom_bar(fill = "pink", color = "black") +
  ggtitle("Distribution of Undergraduate Degrees") +
  xlab("Undergraduate Degree") +
  ylab("Count")

#Explanation The most common degree is Business.

Descriptive statistics

# Five-number summary plus mean of the annual salary column.
summary(object = MBAdata[["Annual Salary"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   20000   87125  103500  109058  124000  340000

Histogram

# Histogram of annual salary in 5000-wide bins with a comma-formatted x axis.
ggplot(data = MBAdata, mapping = aes(x = `Annual Salary`)) +
  geom_histogram(binwidth = 5000, fill = "pink", color = "black") +
  scale_x_continuous(labels = scales::comma) +
  ggtitle("Distribution of Annual Salary") +
  xlab("Annual Salary") +
  ylab("Frequency") +
  theme_minimal()

#Explanation Distribution is skewed to the right.

Testing the hypothesis

# One-sample, two-sided t-test of H0: mean MBA grade = 74.
t.test(MBAdata$`MBA Grade`, 
       mu = 74,
       alternative = "two.sided")
## 
##  One Sample t-test
## 
## data:  MBAdata$`MBA Grade`
## t = 2.6587, df = 99, p-value = 0.00915
## alternative hypothesis: true mean is not equal to 74
## 95 percent confidence interval:
##  74.51764 77.56346
## sample estimates:
## mean of x 
##  76.04055

#Explanation Since the p-value (0.00915) is lower than 5%, we can reject H0 (mean MBA grade equals 74) and conclude that the mean is not 74; the estimate of the mean is 76.04055.

library("effectsize")

# Cohen's d for the MBA grades relative to the hypothesised mean of 74.
cohens_d(x = MBAdata$`MBA Grade`, mu = 74)
## Cohen's d |       95% CI
## ------------------------
## 0.27      | [0.07, 0.46]
## 
## - Deviation from a difference of 74.
# Classify the computed effect size rather than a hand-copied, rounded 0.27,
# so the label cannot go stale if the data change.
interpret_cohens_d(
  cohens_d(MBAdata$`MBA Grade`, mu = 74)$Cohens_d,
  rules = "sawilowsky2009"
)
## [1] "small"
## (Rules: sawilowsky2009)

#Explanation Based on the results, we can see that the effect is small.

Task 3

# Import the apartments workbook and preview its first rows.
Appdata <- read_excel(
  path = "~/Desktop/IMB/R Take Home Exam 2024/Task 3/Apartments.xlsx"
)
head(x = Appdata)
## # A tibble: 6 × 5
##     Age Distance Price Parking Balcony
##   <dbl>    <dbl> <dbl>   <dbl>   <dbl>
## 1     7       28  1640       0       1
## 2    18        1  2800       1       0
## 3     7       28  1660       0       0
## 4    28       29  1850       0       1
## 5    18       18  1640       1       1
## 6    28       12  1770       0       1

Description:

Changing variables into factors

# Recode the 0/1 Parking indicator as a NO/YES factor in a new column.
Appdata[["ParkingFactor"]] <- factor(
  x = Appdata[["Parking"]],
  levels = c(0, 1),
  labels = c("NO", "YES")
)
head(x = Appdata)
## # A tibble: 6 × 6
##     Age Distance Price Parking Balcony ParkingFactor
##   <dbl>    <dbl> <dbl>   <dbl>   <dbl> <fct>        
## 1     7       28  1640       0       1 NO           
## 2    18        1  2800       1       0 YES          
## 3     7       28  1660       0       0 NO           
## 4    28       29  1850       0       1 NO           
## 5    18       18  1640       1       1 YES          
## 6    28       12  1770       0       1 NO
# Recode the 0/1 Balcony indicator as a NO/YES factor in a new column.
Appdata[["BalconyFactor"]] <- factor(
  x = Appdata[["Balcony"]],
  levels = c(0, 1),
  labels = c("NO", "YES")
)
head(x = Appdata)
## # A tibble: 6 × 7
##     Age Distance Price Parking Balcony ParkingFactor BalconyFactor
##   <dbl>    <dbl> <dbl>   <dbl>   <dbl> <fct>         <fct>        
## 1     7       28  1640       0       1 NO            YES          
## 2    18        1  2800       1       0 YES           NO           
## 3     7       28  1660       0       0 NO            NO           
## 4    28       29  1850       0       1 NO            YES          
## 5    18       18  1640       1       1 YES           YES          
## 6    28       12  1770       0       1 NO            YES

Test hypothesis

# One-sample, two-sided t-test of H0: mean apartment price = 1900.
t.test(Appdata$Price, 
       mu = 1900,
       alternative = "two.sided")
## 
##  One Sample t-test
## 
## data:  Appdata$Price
## t = 2.9022, df = 84, p-value = 0.004731
## alternative hypothesis: true mean is not equal to 1900
## 95 percent confidence interval:
##  1937.443 2100.440
## sample estimates:
## mean of x 
##  2018.941

#Explanation Since the p-value (0.004731) is lower than 5%, we can reject H0 (mean price equals 1900). The estimated mean is 2018.941.

Simple regression

# Simple linear regression of Price on Age.
fitl <- lm(formula = Price ~ Age, data = Appdata)
summary(object = fitl)
## 
## Call:
## lm(formula = Price ~ Age, data = Appdata)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -623.9 -278.0  -69.8  243.5  776.1 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2185.455     87.043  25.108   <2e-16 ***
## Age           -8.975      4.164  -2.156    0.034 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 369.9 on 83 degrees of freedom
## Multiple R-squared:  0.05302,    Adjusted R-squared:  0.04161 
## F-statistic: 4.647 on 1 and 83 DF,  p-value: 0.03401

#Explanation The intercept is 2185.455. The slope of -8.975 shows that if the age of an apartment increases by 1 year, the price will on average decrease by 8.975. Coefficient of determination: 5.302% of the variability in Price is explained by the linear effect of Age. Coefficient of correlation: the square root of the coefficient of determination, taken with the sign of the slope, which equals -0.2303.

Scatterplot matrix

library(car)
## Loading required package: carData
# Scatterplot matrix of Age, Distance and Price, without smoothers.
scatterplotMatrix(
  Appdata[, c("Age", "Distance", "Price")],
  smooth = FALSE
)

#Explanation Judging from the scatterplot matrix, we can see that there is no strong multicollinearity between the explanatory variables Age and Distance.

Fit2 - Price = f(Age, Distance)

# Multiple linear regression of Price on Age and Distance.
fit2 <- lm(formula = Price ~ Age + Distance, data = Appdata)
summary(object = fit2)
## 
## Call:
## lm(formula = Price ~ Age + Distance, data = Appdata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -603.23 -219.94  -85.68  211.31  689.58 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2460.101     76.632   32.10  < 2e-16 ***
## Age           -7.934      3.225   -2.46    0.016 *  
## Distance     -20.667      2.748   -7.52 6.18e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 286.3 on 82 degrees of freedom
## Multiple R-squared:  0.4396, Adjusted R-squared:  0.4259 
## F-statistic: 32.16 on 2 and 82 DF,  p-value: 4.896e-11

VIF

# Variance inflation factors of the explanatory variables in fit2.
car::vif(fit2)
##      Age Distance 
## 1.001845 1.001845

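#Explanation As the VIF of both explanatory variables is almost 1, we can conclude that the explanatory variables (Age and Distance) are almost completely uncorrelated with each other, so multicollinearity is not a problem.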