library(readxl)
mydata <- read_xlsx("./Take_home_exam.xlsx")
mydata <- as.data.frame(mydata)
head(mydata)
## school sex age famrel freetime goout absences Dalc Walc
## 1 GP 2 18 4 3 4 4 1 1
## 2 GP 2 17 5 3 3 2 1 1
## 3 GP 2 15 4 3 2 6 2 3
## 4 GP 2 15 3 2 2 0 1 1
## 5 GP 2 16 4 3 2 0 1 2
## 6 GP 1 16 5 4 2 6 1 2
Indicated the gender of the students.
mydata$sex_factor <- factor(mydata$sex,
levels = c(1, 2),
labels = c("male", "female"))
head(mydata)
## school sex age famrel freetime goout absences Dalc Walc sex_factor
## 1 GP 2 18 4 3 4 4 1 1 female
## 2 GP 2 17 5 3 3 2 1 1 female
## 3 GP 2 15 4 3 2 6 2 3 female
## 4 GP 2 15 3 2 2 0 1 1 female
## 5 GP 2 16 4 3 2 0 1 2 female
## 6 GP 1 16 5 4 2 6 1 2 male
Removed the school column, since all students are from the same school and the sex column, since we have column sex_factor.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
mydata1 <- mydata %>% select(-school, -sex)
head (mydata1)
## age famrel freetime goout absences Dalc Walc sex_factor
## 1 18 4 3 4 4 1 1 female
## 2 17 5 3 3 2 1 1 female
## 3 15 4 3 2 6 2 3 female
## 4 15 3 2 2 0 1 1 female
## 5 16 4 3 2 0 1 2 female
## 6 16 5 4 2 6 1 2 male
Selected only students who drinks alcohol on weekends more than others (more than 2).
mydata2 <- mydata1[ mydata$Walc >= 2 , ]
head (mydata2)
## age famrel freetime goout absences Dalc Walc sex_factor
## 3 15 4 3 2 6 2 3 female
## 5 16 4 3 2 0 1 2 female
## 6 16 5 4 2 6 1 2 male
## 11 15 3 3 3 2 1 2 female
## 13 15 4 3 3 0 1 3 male
## 14 15 5 4 3 0 1 2 male
Descriptive statistics for students that drink alcohol on weekends more than others, removing column sex_factor.
library(pastecs)
##
## Attaching package: 'pastecs'
## The following objects are masked from 'package:dplyr':
##
## first, last
round(stat.desc(mydata2[ , c(-8) ]), 2)
## age famrel freetime goout absences Dalc Walc
## nbr.val 53.00 53.00 53.00 53.00 53.00 53.00 53.00
## nbr.null 0.00 0.00 0.00 0.00 20.00 0.00 0.00
## nbr.na 0.00 0.00 0.00 0.00 0.00 0.00 0.00
## min 15.00 1.00 1.00 1.00 0.00 1.00 2.00
## max 17.00 5.00 5.00 5.00 16.00 5.00 5.00
## range 2.00 4.00 4.00 4.00 16.00 4.00 3.00
## sum 824.00 205.00 171.00 170.00 177.00 94.00 162.00
## median 16.00 4.00 3.00 3.00 2.00 2.00 3.00
## mean 15.55 3.87 3.23 3.21 3.34 1.77 3.06
## SE.mean 0.07 0.15 0.13 0.13 0.54 0.14 0.13
## CI.mean.0.95 0.15 0.29 0.27 0.27 1.08 0.29 0.26
## var 0.29 1.12 0.95 0.94 15.27 1.10 0.86
## std.dev 0.54 1.06 0.97 0.97 3.91 1.05 0.93
## coef.var 0.03 0.27 0.30 0.30 1.17 0.59 0.30
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:pastecs':
##
## extract
library(dplyr)
mydata_long <- mydata1 %>%
pivot_longer(cols = c(Dalc, Walc),
names_to = "day_type",
values_to = "alcohol_consumption")
library(ggplot2)
ggplot(mydata_long, aes(x = factor(alcohol_consumption), fill = sex_factor)) +
geom_bar(position = "dodge") +
facet_wrap(~ day_type) +
labs(x = "Alcohol Consumption Level (1 to 5)", y = "Count",
title = "Alcohol Consumption by Gender: Workdays vs Weekends") +
theme_minimal()
library(readxl)
mydata3 <- read_xlsx("./Business_School.xlsx")
mydata3 <- as.data.frame(mydata3)
head(mydata3)
## Student ID Undergrad Degree Undergrad Grade MBA Grade Work Experience
## 1 1 Business 68.4 90.2 No
## 2 2 Computer Science 70.2 68.7 Yes
## 3 3 Finance 76.4 83.3 No
## 4 4 Business 82.6 88.7 No
## 5 5 Finance 76.9 75.4 No
## 6 6 Computer Science 83.3 82.1 No
## Employability (Before) Employability (After) Status Annual Salary
## 1 252 276 Placed 111000
## 2 101 119 Placed 107000
## 3 401 462 Placed 109000
## 4 287 342 Placed 148000
## 5 275 347 Placed 255500
## 6 254 313 Placed 103500
library(ggplot2)
ggplot(mydata3, aes(x = `Undergrad Degree`)) +
geom_bar(binwidth = 2, colour="black", fill = "pink") +
labs(x = "Undergraduate Degree", y = "Count",
title = "Distribution of Undergraduate Degrees") +
theme_minimal()
## Warning in geom_bar(binwidth = 2, colour = "black", fill = "pink"): Ignoring
## unknown parameters: `binwidth`
The most common degree is Business.
library(pastecs)
round(stat.desc(mydata3[ , "Annual Salary"]), 2)
## nbr.val nbr.null nbr.na min max range
## 1.000000e+02 0.000000e+00 0.000000e+00 2.000000e+04 3.400000e+05 3.200000e+05
## sum median mean SE.mean CI.mean.0.95 var
## 1.090580e+07 1.035000e+05 1.090580e+05 4.150150e+03 8.234800e+03 1.722373e+09
## std.dev coef.var
## 4.150149e+04 3.800000e-01
library(ggplot2)
ggplot(mydata3, aes(x = `Annual Salary`)) +
geom_bar(binwidth = 2, colour="violet") +
labs(x = "Annual Salary", y = "Count",
title = "Distribution of Annual Salary") +
theme_minimal()
## Warning in geom_bar(binwidth = 2, colour = "violet"): Ignoring unknown
## parameters: `binwidth`
Checked the data to see the mean of MBA grade for comparison with the results of t-test.
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
describeBy(mydata3$`MBA Grade`)
## Warning in describeBy(mydata3$`MBA Grade`): no grouping variable requested
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 100 76.04 7.68 76.38 76.18 8.29 58.14 95 36.86 -0.15 -0.59 0.77
t_test_result <- t.test(mydata3$`MBA Grade`, mu = 74)
print(t_test_result)
##
## One Sample t-test
##
## data: mydata3$`MBA Grade`
## t = 2.6587, df = 99, p-value = 0.00915
## alternative hypothesis: true mean is not equal to 74
## 95 percent confidence interval:
## 74.51764 77.56346
## sample estimates:
## mean of x
## 76.04055
p-value is less than 5%, so we reject the null hypothesis => true average of MBA grade is not equal to 74, but is 76.04055.
library(readxl)
mydata4 <- read_xlsx("./Apartments.xlsx")
mydata3 <- as.data.frame(mydata4)
head(mydata4)
## # A tibble: 6 × 5
## Age Distance Price Parking Balcony
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7 28 1640 0 1
## 2 18 1 2800 1 0
## 3 7 28 1660 0 0
## 4 28 29 1850 0 1
## 5 18 18 1640 1 1
## 6 28 12 1770 0 1
Description:
mydata4$ParkingFactor <- factor(mydata4$Parking,
levels = c(0, 1),
labels = c("No", "Yes"))
mydata4$BalconyFactor <- factor(mydata4$Balcony,
levels = c(0, 1),
labels = c("No", "Yes"))
head(mydata4)
## # A tibble: 6 × 7
## Age Distance Price Parking Balcony ParkingFactor BalconyFactor
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <fct>
## 1 7 28 1640 0 1 No Yes
## 2 18 1 2800 1 0 Yes No
## 3 7 28 1660 0 0 No No
## 4 28 29 1850 0 1 No Yes
## 5 18 18 1640 1 1 Yes Yes
## 6 28 12 1770 0 1 No Yes
mydata4$ID <- 1:nrow(mydata4)
head (mydata4)
## # A tibble: 6 × 8
## Age Distance Price Parking Balcony ParkingFactor BalconyFactor ID
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <fct> <int>
## 1 7 28 1640 0 1 No Yes 1
## 2 18 1 2800 1 0 Yes No 2
## 3 7 28 1660 0 0 No No 3
## 4 28 29 1850 0 1 No Yes 4
## 5 18 18 1640 1 1 Yes Yes 5
## 6 28 12 1770 0 1 No Yes 6
t_test_result1 <- t.test(mydata4$Price, mu = 1900)
print(t_test_result1)
##
## One Sample t-test
##
## data: mydata4$Price
## t = 2.9022, df = 84, p-value = 0.004731
## alternative hypothesis: true mean is not equal to 1900
## 95 percent confidence interval:
## 1937.443 2100.440
## sample estimates:
## mean of x
## 2018.941
p-value is less than 5%, so we reject the null hypothesis => true average is not equal to 1900, but is 2018.941.
fit1 <- lm(Price ~ Age,
data = mydata4)
summary(fit1)
##
## Call:
## lm(formula = Price ~ Age, data = mydata4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -623.9 -278.0 -69.8 243.5 776.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2185.455 87.043 25.108 <2e-16 ***
## Age -8.975 4.164 -2.156 0.034 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 369.9 on 83 degrees of freedom
## Multiple R-squared: 0.05302, Adjusted R-squared: 0.04161
## F-statistic: 4.647 on 1 and 83 DF, p-value: 0.03401
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
scatterplotMatrix(~ Price + Age + Distance, data = mydata4,
main = "Scatterplot Matrix between Price, Age, and Distance",
smooth = FALSE)
fit2 <- lm(Price ~ Age + Distance,
data = mydata4)
summary(fit2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = mydata4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -603.23 -219.94 -85.68 211.31 689.58
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2460.101 76.632 32.10 < 2e-16 ***
## Age -7.934 3.225 -2.46 0.016 *
## Distance -20.667 2.748 -7.52 6.18e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 286.3 on 82 degrees of freedom
## Multiple R-squared: 0.4396, Adjusted R-squared: 0.4259
## F-statistic: 32.16 on 2 and 82 DF, p-value: 4.896e-11
library(car)
vif(fit2)
## Age Distance
## 1.001845 1.001845
Both values are lower than 5, so there is no very strong correlation between explanatory variables, so both variables can be used.
mydata4$StdResid <- round(rstandard(fit2), 3)
hist(mydata4$StdResid,
xlab = "Standardized residuals",
ylab = "Frequency",
main = "Histogram of standardized residuals")
Residuals outside the boundaries [-3; +3] should be removed, as they can be outliers. In our graph we do not see such residuals.
mydata4$CooksD <- round(cooks.distance(fit2), 3)
hist(mydata4$CooksD,
xlab = "Cooks distance",
ylab = "Frequency",
main = "Histogram of Cooks distances")
head(mydata4[order(-mydata4$CooksD), c("ID", "CooksD")], 20)
## # A tibble: 20 × 2
## ID CooksD
## <int> <dbl>
## 1 38 0.32
## 2 55 0.104
## 3 33 0.069
## 4 53 0.066
## 5 22 0.061
## 6 39 0.038
## 7 58 0.037
## 8 25 0.034
## 9 57 0.032
## 10 2 0.03
## 11 31 0.03
## 12 61 0.03
## 13 27 0.023
## 14 48 0.022
## 15 80 0.022
## 16 11 0.02
## 17 70 0.02
## 18 46 0.019
## 19 78 0.019
## 20 10 0.017
library(dplyr)
mydata4 <- mydata4 %>%
filter(!ID %in% c(38, 55, 33, 53, 22))
In the histogram of Cooks distances we can notice a gap between 0.15 and 0.30 => there are potential units with large influence. We arrange the values in descending order and see that the first 5 values differ too much from the others, so we remove them and try to graph histogram of Cooks distances one more time.
fit2 <- lm(Price ~ Age + Distance,
data = mydata4)
mydata4$StdResid <- round(rstandard(fit2), 3)
mydata4$CooksD <- round(cooks.distance(fit2), 3)
hist(mydata4$CooksD,
xlab = "Cooks distance",
ylab = "Frequency",
main = "Histogram of Cooks distances")
Now there are no gaps in the histogram of Cooks distances.
library(car)
mydata4$StdFittedValues <- scale(fit2$fitted.values)
mydata4$StdResid <- scale(residuals(fit2))
scatterplot(y = mydata4$StdResid,
x = mydata4$StdFittedValues,
ylab = "Standardized Residuals",
xlab = "Standardized Fitted Values",
boxplots = FALSE,
regLine = FALSE,
smooth = FALSE)
On the graph, we can see an uneven distribution, which means there may be heteroskedasticity. We check this assumption with Breusch-Pagan test.
library(olsrr)
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
ols_test_breusch_pagan(fit2)
##
## Breusch Pagan Test for Heteroskedasticity
## -----------------------------------------
## Ho: the variance is constant
## Ha: the variance is not constant
##
## Data
## ---------------------------------
## Response : Price
## Variables: fitted values of Price
##
## Test Summary
## ----------------------------
## DF = 1
## Chi2 = 1.738591
## Prob > Chi2 = 0.1873174
The null hypothesis that the variance of errors is constant cannot be rejected (p-value is above 0.05) => we assume homoskedasticity.
library(dplyr)
mydata4 <- mydata4 %>%
filter(!ID %in% c(38, 55, 33, 53, 22))
fit2 <- lm(Price ~ Age + Distance,
data = mydata4)
mydata4$StdResid <- round(rstandard(fit2), 3)
mydata4$CooksD <- round(cooks.distance(fit2), 3)
hist(mydata4$StdResid,
xlab = "Standardized residuals",
ylab = "Frequency",
main = "Histogram of standardized residuals")
shapiro.test(mydata4$StdResid)
##
## Shapiro-Wilk normality test
##
## data: mydata4$StdResid
## W = 0.94154, p-value = 0.001166
Shapiro-Wilk normality test: p-value is lower than 0.05, so the null hypothesis that the standardized residuals are normally distributed can be rejected, which means that the errors in the population are most likely not normally distributed. Although we have a big number of samples (n>30), so due to CLM a violation of this assumption would not significantly affect our results.
summary(fit2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = mydata4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -411.50 -203.69 -45.24 191.11 492.56
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2502.467 75.024 33.356 < 2e-16 ***
## Age -8.674 3.221 -2.693 0.00869 **
## Distance -24.063 2.692 -8.939 1.57e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 256.8 on 77 degrees of freedom
## Multiple R-squared: 0.5361, Adjusted R-squared: 0.524
## F-statistic: 44.49 on 2 and 77 DF, p-value: 1.437e-13
fit3 <- lm(Price ~ Age + Distance + ParkingFactor + BalconyFactor,
data = mydata4)
anova(fit2, fit3)
## Analysis of Variance Table
##
## Model 1: Price ~ Age + Distance
## Model 2: Price ~ Age + Distance + ParkingFactor + BalconyFactor
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 77 5077362
## 2 75 4791128 2 286234 2.2403 0.1135
p-value is 0.1135, so we cannot reject the null hypothesis that the two models are significantly different and cannot say that model fit3 fits data better than model fit2.
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + ParkingFactor + BalconyFactor,
## data = mydata4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -390.93 -198.19 -53.64 186.73 518.34
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2393.316 93.930 25.480 < 2e-16 ***
## Age -7.970 3.191 -2.498 0.0147 *
## Distance -21.961 2.830 -7.762 3.39e-11 ***
## ParkingFactorYes 128.700 60.801 2.117 0.0376 *
## BalconyFactorYes 6.032 57.307 0.105 0.9165
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 252.7 on 75 degrees of freedom
## Multiple R-squared: 0.5623, Adjusted R-squared: 0.5389
## F-statistic: 24.08 on 4 and 75 DF, p-value: 7.764e-13
mydata4$FittedValues <- fitted.values(fit3)
mydata4$Residuals <- residuals(fit3)
head(mydata4[ , colnames(mydata4) %in% c("ID", "Price", "FittedValues",
"Residuals")])
## # A tibble: 6 × 4
## Price ID FittedValues Residuals
## <dbl> <int> <dbl> <dbl>
## 1 1640 1 1729. -88.6
## 2 2800 2 2357. 443.
## 3 1660 3 1723. -62.6
## 4 1850 4 1539. 311.
## 5 1640 5 1989. -349.
## 6 1770 6 1913. -143.
Based on the estimated regression function, we would expect the price per m2 for the apartment ID2 will be 2357 euros. The actual price was 2800 euros per m2, so the residual is 443 euros per m2.