Import the dataset Apartments.xlsx

library(readxl)

# Load the apartments workbook and coerce the object returned by read_xlsx()
# to a plain data.frame in one step.
# NOTE(review): the absolute path is specific to one machine; the script only
# runs where this exact file location exists.
mydata4 <- as.data.frame(
  read_xlsx("C:/Users/Korisnik/Desktop/domaci/Apartments.xlsx")
)

# Preview the first rows of the data.
head(mydata4)
##   Age Distance Price Parking Balcony
## 1   7       28  1640       0       1
## 2  18        1  2800       1       0
## 3   7       28  1660       0       0
## 4  28       29  1850       0       1
## 5  18       18  1640       1       1
## 6  28       12  1770       0       1

Description:

Change categorical variables into factors.

# Recode both 0/1 indicator columns as factors in one pass, so that lm()
# treats them as categorical predictors.
mydata4[c("Parking", "Balcony")] <- lapply(
  mydata4[c("Parking", "Balcony")],
  as.factor
)

# Confirm the resulting column types.
str(mydata4)
## 'data.frame':    85 obs. of  5 variables:
##  $ Age     : num  7 18 7 28 18 28 14 18 22 25 ...
##  $ Distance: num  28 1 28 29 18 12 20 6 7 2 ...
##  $ Price   : num  1640 2800 1660 1850 1640 1770 1850 1970 2270 2570 ...
##  $ Parking : Factor w/ 2 levels "0","1": 1 2 1 1 2 1 1 2 2 2 ...
##  $ Balcony : Factor w/ 2 levels "0","1": 2 1 1 2 2 2 2 2 1 1 ...

Test the hypothesis H0: Mu_Price = 1900 eur. What can you conclude?

# Two-sided one-sample t-test of H0: mean Price = 1900 against
# H1: mean Price != 1900.
t_test_result <- t.test(mydata4$Price, mu = 1900)
t_test_result
## 
##  One Sample t-test
## 
## data:  mydata4$Price
## t = 2.9022, df = 84, p-value = 0.004731
## alternative hypothesis: true mean is not equal to 1900
## 95 percent confidence interval:
##  1937.443 2100.440
## sample estimates:
## mean of x 
##  2018.941
# Sample mean, for comparison with the hypothesized value of 1900.
mean(mydata4$Price)
## [1] 2018.941
# Conclusion: p = 0.0047 < 0.05, so H0 is rejected at the 5% level. The 95%
# confidence interval (1937.4, 2100.4) lies entirely above 1900, so the
# average price is significantly higher than 1900.

Estimate the simple regression function: Price = f(Age). Save results in object fit1 and explain the estimate of regression coefficient, coefficient of correlation and coefficient of determination.

# Simple linear regression of Price on Age.
fit1 <- lm(Price ~ Age, data = mydata4)
summary(fit1)
## 
## Call:
## lm(formula = Price ~ Age, data = mydata4)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -623.9 -278.0  -69.8  243.5  776.1 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2185.455     87.043  25.108   <2e-16 ***
## Age           -8.975      4.164  -2.156    0.034 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 369.9 on 83 degrees of freedom
## Multiple R-squared:  0.05302,    Adjusted R-squared:  0.04161 
## F-statistic: 4.647 on 1 and 83 DF,  p-value: 0.03401
# Correlation
cor(mydata4$Price, mydata4$Age)
## [1] -0.230255
# Interpretation:
# - Regression coefficient: each additional unit of Age (presumably one year;
#   confirm against the data description) is associated with a price that is
#   on average 8.975 lower (p = 0.034, significant at the 5% level).
# - Coefficient of correlation: r = -0.230, a weak negative linear
#   relationship between Price and Age.
# - Coefficient of determination: R^2 = 0.053 (= r^2), so Age explains about
#   5.3% of the variability in Price.

Show the scatterplot matrix between Price, Age and Distance. Based on the matrix, determine whether there is a potential problem with multicollinearity.

# Pairwise scatterplots of the response and both numeric predictors. For
# multicollinearity, inspect the Age-Distance panels: a strong linear pattern
# between the two explanatory variables would signal a potential problem.
pairs(~ Price + Age + Distance, data = mydata4, main = "Scatterplot Matrix")

Estimate the multiple regression function: Price = f(Age, Distance). Save it in object named fit2.

# Multiple linear regression of Price on Age and Distance.
fit2 <- lm(Price ~ Age + Distance, data = mydata4)
summary(fit2)
## 
## Call:
## lm(formula = Price ~ Age + Distance, data = mydata4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -603.23 -219.94  -85.68  211.31  689.58 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2460.101     76.632   32.10  < 2e-16 ***
## Age           -7.934      3.225   -2.46    0.016 *  
## Distance     -20.667      2.748   -7.52 6.18e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 286.3 on 82 degrees of freedom
## Multiple R-squared:  0.4396, Adjusted R-squared:  0.4259 
## F-statistic: 32.16 on 2 and 82 DF,  p-value: 4.896e-11
# Interpretation: holding the other predictor constant, one more unit of Age
# lowers the expected price by 7.934 (p = 0.016) and one more unit of Distance
# lowers it by 20.667 (p < 0.001). Together the two predictors explain about
# 44% of the variability in Price (R^2 = 0.4396), up from about 5% in fit1.

Check for multicollinearity with VIF statistics. Explain the findings.

library(car)
## Loading required package: carData
# Variance inflation factors for the predictors of fit2.
vif(fit2)
##      Age Distance 
## 1.001845 1.001845
# Both VIFs are approximately 1, far below the commonly used warning threshold
# of 5, so there is no indication of multicollinearity between Age and
# Distance.

Calculate standardized residuals and Cook's distances for model fit2. Remove any potentially problematic units (outliers or units with high influence).

# Standardized residuals
# (one value per observation used in fit2; reused by the later diagnostics)
std_resid <- rstandard(fit2)

# Cook's distance
# (influence measure, one value per observation used in fit2)
cooks_d <- cooks.distance(fit2)

# Identify potential outliers
# Rule used here: |standardized residual| > 2 flags an outlier.
which(abs(std_resid) > 2)
## 33 38 53 
## 33 38 53
# Rule used here: Cook's distance > 4 / n flags a high-influence unit, where
# n = length(cooks_d).
which(cooks_d > 4/length(cooks_d))
## 22 33 38 53 55 
## 22 33 38 53 55
# Units 33, 38 and 53 are flagged by both rules; units 22 and 55 are flagged
# by Cook's distance only. Nothing is removed in this chunk; the flagged rows
# are dropped later, when clean_data is built for re-estimating the model.

Check for potential heteroskedasticity with a scatterplot of standardized residuals against standardized fitted values. Explain the findings.

# Standardize the fitted values of fit2 (centered and scaled by scale()).
std_fit <- scale(fitted(fit2))

# Standardized residuals against standardized fitted values. A funnel or
# fan shape around the red zero line would point to heteroskedasticity.
plot(
  x = std_fit,
  y = std_resid,
  main = "Residuals vs Fitted",
  xlab = "Standardized Fitted Values",
  ylab = "Standardized Residuals"
)
abline(h = 0, col = "red")

Are the standardized residuals distributed normally? Show the graph and formally test it. Explain the findings.

# Histogram of the standardized residuals (visual check of normality)
hist(
  std_resid,
  xlab = "Std Residuals",
  main = "Histogram of Standardized Residuals"
)

The histogram suggests an approximately normal distribution of residuals.

# Normal Q-Q plot of the standardized residuals with a red reference line;
# systematic departures from the line indicate non-normality.
qqnorm(y = std_resid)
qqline(y = std_resid, col = "red")

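The Q-Q plot also suggests an approximately normal distribution of residuals. Note, however, that the formal Shapiro-Wilk test below rejects the null hypothesis of normality (W = 0.953, p = 0.004 < 0.05), so this visual impression is not confirmed by the test.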

# Shapiro-Wilk test
# H0: the standardized residuals are normally distributed.
# H1: the standardized residuals are not normally distributed.
shapiro.test(std_resid)
## 
##  Shapiro-Wilk normality test
## 
## data:  std_resid
## W = 0.95306, p-value = 0.00366
# Conclusion: p = 0.0037 < 0.05, so H0 is rejected at the 5% level. The
# standardized residuals of fit2 deviate significantly from normality.

Estimate fit2 again without the potentially problematic units identified above and show the summary of the model. Explain all coefficients.

# Remove problematic units (if any found earlier)
# Keep only rows that pass BOTH rules: |standardized residual| <= 2 and
# Cook's distance <= 4 / n. This drops the five units flagged earlier
# (22, 33, 38, 53, 55), leaving 80 observations (77 residual df + 3
# coefficients in the summary below).
# NOTE(review): the logical index relies on std_resid and cooks_d having one
# entry per row of mydata4, in the same order.
clean_data <- mydata4[abs(std_resid) <= 2 & cooks_d <= 4/length(cooks_d), ]

# Re-estimate the fit2 specification on the cleaned data.
fit2_clean <- lm(Price ~ Age + Distance, data = clean_data)
summary(fit2_clean)
## 
## Call:
## lm(formula = Price ~ Age + Distance, data = clean_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -411.50 -203.69  -45.24  191.11  492.56 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2502.467     75.024  33.356  < 2e-16 ***
## Age           -8.674      3.221  -2.693  0.00869 ** 
## Distance     -24.063      2.692  -8.939 1.57e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 256.8 on 77 degrees of freedom
## Multiple R-squared:  0.5361, Adjusted R-squared:  0.524 
## F-statistic: 44.49 on 2 and 77 DF,  p-value: 1.437e-13
# Interpretation:
# - Intercept: the expected price is 2502.467 when Age = 0 and Distance = 0.
# - Age: holding Distance constant, one more unit of Age lowers the expected
#   price by 8.674 (p = 0.009).
# - Distance: holding Age constant, one more unit of Distance lowers the
#   expected price by 24.063 (p < 0.001).
# - R^2 = 0.5361: Age and Distance explain about 54% of the variability in
#   Price after removing the flagged units (0.4396 before removal).

Estimate the linear regression function Price = f(Age, Distance, Parking, Balcony). Be careful to correctly include categorical variables. Save it in an object named fit3.

# Multiple regression with both numeric predictors and the two factors.
# Because Parking and Balcony are factors, lm() enters each as a dummy
# variable (Parking1, Balcony1) measured against the reference level "0".
# The model is estimated on the full data set mydata4, not on clean_data.
fit3 <- lm(Price ~ Age + Distance + Parking + Balcony, data = mydata4)
summary(fit3)
## 
## Call:
## lm(formula = Price ~ Age + Distance + Parking + Balcony, data = mydata4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -459.92 -200.66  -57.48  260.08  594.37 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2301.667     94.271  24.415  < 2e-16 ***
## Age           -6.799      3.110  -2.186  0.03172 *  
## Distance     -18.045      2.758  -6.543 5.28e-09 ***
## Parking1     196.168     62.868   3.120  0.00251 ** 
## Balcony1       1.935     60.014   0.032  0.97436    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 273.7 on 80 degrees of freedom
## Multiple R-squared:  0.5004, Adjusted R-squared:  0.4754 
## F-statistic: 20.03 on 4 and 80 DF,  p-value: 1.849e-11

R² ≈ 0.50: the extended model explains somewhat more of the variation in prices than fit2 (R² ≈ 0.44).

Use the function anova to check whether model fit3 fits the data better than model fit2.

# Partial F-test comparing the nested models fit2 and fit3.
# H0: the coefficients of Parking and Balcony are both zero (fit3 is no
#     better than fit2).
# H1: at least one of the two coefficients differs from zero.
anova(fit2, fit3)
## Analysis of Variance Table
## 
## Model 1: Price ~ Age + Distance
## Model 2: Price ~ Age + Distance + Parking + Balcony
##   Res.Df     RSS Df Sum of Sq      F  Pr(>F)  
## 1     82 6720983                              
## 2     80 5991088  2    729894 4.8732 0.01007 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Conclusion: F = 4.87 with p = 0.010 < 0.05, so H0 is rejected at the 5%
# level. Adding Parking and Balcony significantly reduces the residual sum of
# squares, i.e. fit3 fits the data better than fit2.

Show the results of fit3 and explain the regression coefficients for both categorical variables. Can you write down the hypothesis that is being tested with the F-statistic shown at the bottom of the output?

# Print the fit3 results again for interpretation.
summary(fit3)
## 
## Call:
## lm(formula = Price ~ Age + Distance + Parking + Balcony, data = mydata4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -459.92 -200.66  -57.48  260.08  594.37 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2301.667     94.271  24.415  < 2e-16 ***
## Age           -6.799      3.110  -2.186  0.03172 *  
## Distance     -18.045      2.758  -6.543 5.28e-09 ***
## Parking1     196.168     62.868   3.120  0.00251 ** 
## Balcony1       1.935     60.014   0.032  0.97436    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 273.7 on 80 degrees of freedom
## Multiple R-squared:  0.5004, Adjusted R-squared:  0.4754 
## F-statistic: 20.03 on 4 and 80 DF,  p-value: 1.849e-11
# Interpretation of the categorical predictors (reference level is "0";
# presumably 1 = has parking / has balcony - confirm against the data
# description):
# - Parking1: holding Age, Distance and Balcony constant, apartments with
#   Parking = 1 are on average 196.168 more expensive than those with
#   Parking = 0 (p = 0.0025, significant).
# - Balcony1: holding the other predictors constant, apartments with
#   Balcony = 1 are on average 1.935 more expensive than those with
#   Balcony = 0; the difference is not significant (p = 0.974).
# F-statistic at the bottom of the output (overall significance test):
#   H0: beta_Age = beta_Distance = beta_Parking1 = beta_Balcony1 = 0
#   H1: at least one of these coefficients differs from zero
# F = 20.03 on 4 and 80 df, p = 1.8e-11, so H0 is rejected: at least one
# predictor is linearly related to Price.

Save fitted values and calculate the residual for apartment ID2.

# Store the fitted values and the residuals of fit3.
fitted_values <- fitted.values(fit3)
residuals_fit3 <- resid(fit3)

# Second apartment: model-predicted price ...
fitted_values[2]
##        2 
## 2357.411
# ... and its residual (observed 2800 minus fitted 2357.411).
residuals_fit3[2]
##        2 
## 442.5889