Author: Hui-Ju Huang

1. Import the dataset Apartments.xlsx

# install.packages("readxl")

library(readxl)

mydata <- read_xlsx("./Apartments.xlsx")
head(mydata)
## # A tibble: 6 × 5
##     Age Distance Price Parking Balcony
##   <dbl>    <dbl> <dbl>   <dbl>   <dbl>
## 1     7       28  1640       0       1
## 2    18        1  2800       1       0
## 3     7       28  1660       0       0
## 4    28       29  1850       0       1
## 5    18       18  1640       1       1
## 6    28       12  1770       0       1

Description:

2. Change categorical variables into factors. (0.5 p)

mydata$ParkingF <- factor(mydata$Parking, 
                         levels = c(0, 1), 
                         labels = c("No", "Yes"))

mydata$BalconyF <- factor(mydata$Balcony, 
                         levels = c(0, 1), 
                         labels = c("No", "Yes"))
head(mydata)
## # A tibble: 6 × 7
##     Age Distance Price Parking Balcony ParkingF BalconyF
##   <dbl>    <dbl> <dbl>   <dbl>   <dbl> <fct>    <fct>   
## 1     7       28  1640       0       1 No       Yes     
## 2    18        1  2800       1       0 Yes      No      
## 3     7       28  1660       0       0 No       No      
## 4    28       29  1850       0       1 No       Yes     
## 5    18       18  1640       1       1 Yes      Yes     
## 6    28       12  1770       0       1 No       Yes
library(pastecs)
round(stat.desc(mydata[c("Age", "Distance", "Price")]), 2)
##                  Age Distance     Price
## nbr.val        85.00    85.00     85.00
## nbr.null        0.00     0.00      0.00
## nbr.na          0.00     0.00      0.00
## min             1.00     1.00   1400.00
## max            45.00    45.00   2820.00
## range          44.00    44.00   1420.00
## sum          1577.00  1209.00 171610.00
## median         18.00    12.00   1950.00
## mean           18.55    14.22   2018.94
## SE.mean         1.05     1.23     40.98
## CI.mean.0.95    2.09     2.45     81.50
## var            93.96   129.44 142764.34
## std.dev         9.69    11.38    377.84
## coef.var        0.52     0.80      0.19

3. Test the hypothesis H0: Mu_Price = 1900 eur. What can you conclude? (1 p)

shapiro.test(mydata$Price)
## 
##  Shapiro-Wilk normality test
## 
## data:  mydata$Price
## W = 0.94017, p-value = 0.0006513
wilcox.test(mydata$Price,
            mu = 1900,
            correct = FALSE)
## 
##  Wilcoxon signed rank test
## 
## data:  mydata$Price
## V = 2328, p-value = 0.02828
## alternative hypothesis: true location is not equal to 1900
library(effectsize)
effectsize(wilcox.test(mydata$Price, 
                       mu = 1900, 
                       correct = FALSE))
## r (rank biserial) |       95% CI
## --------------------------------
## 0.27              | [0.04, 0.48]
## 
## - Deviation from a difference of 1900.
interpret_rank_biserial(0.27, rules="funder2019")
## [1] "medium"
## (Rules: funder2019)

4. Estimate the simple regression function: Price = f(Age). Save results in object fit1 and explain the estimate of regression coefficient, coefficient of correlation and coefficient of determination. (2 p)

fit1 <- lm(Price ~ Age, data = mydata)
summary(fit1)
## 
## Call:
## lm(formula = Price ~ Age, data = mydata)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -623.9 -278.0  -69.8  243.5  776.1 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2185.455     87.043  25.108   <2e-16 ***
## Age           -8.975      4.164  -2.156    0.034 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 369.9 on 83 degrees of freedom
## Multiple R-squared:  0.05302,    Adjusted R-squared:  0.04161 
## F-statistic: 4.647 on 1 and 83 DF,  p-value: 0.03401
cor_coef <- sqrt(summary(fit1)$r.squared)
cor_coef
## [1] 0.230255

5. Show the scateerplot matrix between Price, Age and Distance. Based on the matrix determine if there is potential problem with multicolinearity. (0.5 p)

library(car)
## Loading required package: carData
scatterplotMatrix(mydata[c("Price", "Age", "Distance")],
                  smooth = FALSE)

6. Estimate the multiple regression function: Price = f(Age, Distance). Save it in object named fit2.

fit2 <- lm(Price ~ Age + Distance, data = mydata)
summary(fit2)
## 
## Call:
## lm(formula = Price ~ Age + Distance, data = mydata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -603.23 -219.94  -85.68  211.31  689.58 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2460.101     76.632   32.10  < 2e-16 ***
## Age           -7.934      3.225   -2.46    0.016 *  
## Distance     -20.667      2.748   -7.52 6.18e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 286.3 on 82 degrees of freedom
## Multiple R-squared:  0.4396, Adjusted R-squared:  0.4259 
## F-statistic: 32.16 on 2 and 82 DF,  p-value: 4.896e-11

7. Chech the multicolinearity with VIF statistics. Explain the findings. (1 p)

library(car)
vif(fit2) #Checking multicolinearity
##      Age Distance 
## 1.001845 1.001845
mean(vif(fit2))
## [1] 1.001845

8. Calculate standardized residuals and Cooks Distances for model fit2. Remove any potentially problematic units (outliers or units with high influence). (2 p)

mydata$StdResid <- round(rstandard(fit2), 3)

hist(mydata$StdResid,
     xlab = "Standardized residuals",
     ylab = "Frequency",
     main = "Histogram of standardized residuals")

mydata$CooksD <- round(cooks.distance(fit2), 3)

hist(mydata$CooksD,
     xlab = "Cooks distance",
     ylab = "Frequency",
     main = "Histogram of Cooks distances")

head(mydata[order(-mydata$CooksD), c("CooksD")], 10)
## # A tibble: 10 × 1
##    CooksD
##     <dbl>
##  1  0.32 
##  2  0.104
##  3  0.069
##  4  0.066
##  5  0.061
##  6  0.038
##  7  0.037
##  8  0.034
##  9  0.032
## 10  0.03
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:pastecs':
## 
##     first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
mydata <- mydata %>%
  filter(!CooksD %in% c(0.320, 0.104, 0.069, 0.066, 0.061))

9. Check for potential heteroskedasticity with scatterplot between standarized residuals and standrdized fitted values. Explain the findings. (1 p)

fit2.1 <- lm(Price ~ Age + Distance, data = mydata)

mydata$StdResid <- round(rstandard(fit2.1), 3)
mydata$StdFittedValues <- scale(fit2.1$fitted.values)

library(car)
scatterplot(y = mydata$StdResid, x = mydata$StdFittedValues,
            ylab = "Standardized residuals",
            xlab = "Standardized fitted values",
            boxplots = FALSE,
            regLine = FALSE,
            smooth = FALSE)

10. Are standardized residuals ditributed normally? Show the graph and formally test it. Explain the findings. (1 p)

hist(mydata$StdResid,
     xlab = "Standardized residuals",
     ylab = "Frequency",
     main = "Histogram of standardized residuals")

shapiro.test(mydata$StdResid)
## 
##  Shapiro-Wilk normality test
## 
## data:  mydata$StdResid
## W = 0.94154, p-value = 0.001166

11. Estimate the fit2 again without potentially excluded units and show the summary of the model. Explain all coefficients. (2 p)

summary(fit2.1)
## 
## Call:
## lm(formula = Price ~ Age + Distance, data = mydata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -411.50 -203.69  -45.24  191.11  492.56 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2502.467     75.024  33.356  < 2e-16 ***
## Age           -8.674      3.221  -2.693  0.00869 ** 
## Distance     -24.063      2.692  -8.939 1.57e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 256.8 on 77 degrees of freedom
## Multiple R-squared:  0.5361, Adjusted R-squared:  0.524 
## F-statistic: 44.49 on 2 and 77 DF,  p-value: 1.437e-13
sqrt(summary(fit2.1)$r.squared)
## [1] 0.732187

12. Estimate the linear regression function Price = f(Age, Distance, Parking and Balcony). Be careful to correctly include categorical variables. Save the object named fit3.

fit3 <- lm(Price ~ Age + Distance + ParkingF + BalconyF, data = mydata)

13. With function anova check if model fit3 fits data better than model fit2. (1 p)

anova(fit2.1, fit3)
## Analysis of Variance Table
## 
## Model 1: Price ~ Age + Distance
## Model 2: Price ~ Age + Distance + ParkingF + BalconyF
##   Res.Df     RSS Df Sum of Sq      F Pr(>F)
## 1     77 5077362                           
## 2     75 4791128  2    286234 2.2403 0.1135

14. Show the results of fit3 and explain regression coefficient for both categorical variables. Can you write down the hypothesis which is being tested with F-statistics, shown at the bottom of the output? (2 p)

summary(fit3)
## 
## Call:
## lm(formula = Price ~ Age + Distance + ParkingF + BalconyF, data = mydata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -390.93 -198.19  -53.64  186.73  518.34 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2393.316     93.930  25.480  < 2e-16 ***
## Age           -7.970      3.191  -2.498   0.0147 *  
## Distance     -21.961      2.830  -7.762 3.39e-11 ***
## ParkingFYes  128.700     60.801   2.117   0.0376 *  
## BalconyFYes    6.032     57.307   0.105   0.9165    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 252.7 on 75 degrees of freedom
## Multiple R-squared:  0.5623, Adjusted R-squared:  0.5389 
## F-statistic: 24.08 on 4 and 75 DF,  p-value: 7.764e-13

15. Save fitted values and calculate the residual for apartment ID2. (1 p)

mydata$StdFittedValues3 <- scale(fit3$fitted.values)

index_ID2 <- which(mydata$Age == 18 & mydata$Distance == 1 & mydata$Price == 2800 & mydata$Parking == 1 & mydata$Balcony == 0)

# Extract the data for apartment ID2
data_ID2 <- mydata[index_ID2, ]
fitted_value_ID2 <- predict(fit3, newdata = data_ID2)
residual_ID2 <- data_ID2$Price - fitted_value_ID2

print(paste("Fitted value for apartment ID2:", fitted_value_ID2))
## [1] "Fitted value for apartment ID2: 2356.59743503779"
print(paste("Residual for apartment ID2:", residual_ID2))
## [1] "Residual for apartment ID2: 443.402564962207"