#### Import the dataset Apartments.xlsx
``` r
library(readxl)
Apartments <- read_excel("Apartments.xlsx")
head(Apartments)
## # A tibble: 6 × 5
## Age Distance Price Parking Balcony
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7 28 1640 0 1
## 2 18 1 2800 1 0
## 3 7 28 1660 0 0
## 4 28 29 1850 0 1
## 5 18 18 1640 1 1
## 6 28 12 1770 0 1
#Show first few rows of the data to verify head(Apartments)
Description:
Apartments$Parking <- factor(Apartments$Parking, levels = c(0, 1), labels = c("No", "Yes"))
Apartments$Balcony <- factor(Apartments$Balcony, levels = c(0, 1), labels = c("No", "Yes"))
str(Apartments)
## tibble [85 × 5] (S3: tbl_df/tbl/data.frame)
## $ Age : num [1:85] 7 18 7 28 18 28 14 18 22 25 ...
## $ Distance: num [1:85] 28 1 28 29 18 12 20 6 7 2 ...
## $ Price : num [1:85] 1640 2800 1660 1850 1640 1770 1850 1970 2270 2570 ...
## $ Parking : Factor w/ 2 levels "No","Yes": 1 2 1 1 2 1 1 2 2 2 ...
## $ Balcony : Factor w/ 2 levels "No","Yes": 2 1 1 2 2 2 2 2 1 1 ...
t_test_result <- t.test(Apartments$Price, mu = 1900)
t_test_result
##
## One Sample t-test
##
## data: Apartments$Price
## t = 2.9022, df = 84, p-value = 0.004731
## alternative hypothesis: true mean is not equal to 1900
## 95 percent confidence interval:
## 1937.443 2100.440
## sample estimates:
## mean of x
## 2018.941
fit1 <- lm(Price ~ Age, data = Apartments)
summary(fit1)
##
## Call:
## lm(formula = Price ~ Age, data = Apartments)
##
## Residuals:
## Min 1Q Median 3Q Max
## -623.9 -278.0 -69.8 243.5 776.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2185.455 87.043 25.108 <2e-16 ***
## Age -8.975 4.164 -2.156 0.034 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 369.9 on 83 degrees of freedom
## Multiple R-squared: 0.05302, Adjusted R-squared: 0.04161
## F-statistic: 4.647 on 1 and 83 DF, p-value: 0.03401
pairs(~ Price + Age + Distance, data = Apartments,
main = "Scatterplot Matrix")
fit2 <- lm(Price ~ Age + Distance, data = Apartments)
summary(fit2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = Apartments)
##
## Residuals:
## Min 1Q Median 3Q Max
## -603.23 -219.94 -85.68 211.31 689.58
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2460.101 76.632 32.10 < 2e-16 ***
## Age -7.934 3.225 -2.46 0.016 *
## Distance -20.667 2.748 -7.52 6.18e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 286.3 on 82 degrees of freedom
## Multiple R-squared: 0.4396, Adjusted R-squared: 0.4259
## F-statistic: 32.16 on 2 and 82 DF, p-value: 4.896e-11
standardized_residuals <- rstandard(fit2)
cooks_d <- cooks.distance(fit2)
influential_points <- which(cooks_d > (4/nrow(Apartments)))
influential_points
## 22 33 38 53 55
## 22 33 38 53 55
Apartments_clean <- Apartments[-influential_points, ]
standardized_fitted_values <- rstandard(fit2)
plot(standardized_fitted_values, standardized_residuals,
main = "Standardized Residuals vs Fitted Values",
xlab = "Standardized Fitted Values", ylab = "Standardized Residuals")
abline(h = 0, col = "red")
qqnorm(standardized_residuals)
qqline(standardized_residuals, col = "red")
shapiro.test(standardized_residuals)
##
## Shapiro-Wilk normality test
##
## data: standardized_residuals
## W = 0.95306, p-value = 0.00366
fit2_clean <- lm(Price ~ Age + Distance, data = Apartments_clean)
summary(fit2_clean)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = Apartments_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -411.50 -203.69 -45.24 191.11 492.56
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2502.467 75.024 33.356 < 2e-16 ***
## Age -8.674 3.221 -2.693 0.00869 **
## Distance -24.063 2.692 -8.939 1.57e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 256.8 on 77 degrees of freedom
## Multiple R-squared: 0.5361, Adjusted R-squared: 0.524
## F-statistic: 44.49 on 2 and 77 DF, p-value: 1.437e-13
fit3 <- lm(Price ~ Age + Distance + Parking + Balcony, data = Apartments)
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + Parking + Balcony, data = Apartments)
##
## Residuals:
## Min 1Q Median 3Q Max
## -459.92 -200.66 -57.48 260.08 594.37
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2301.667 94.271 24.415 < 2e-16 ***
## Age -6.799 3.110 -2.186 0.03172 *
## Distance -18.045 2.758 -6.543 5.28e-09 ***
## ParkingYes 196.168 62.868 3.120 0.00251 **
## BalconyYes 1.935 60.014 0.032 0.97436
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 273.7 on 80 degrees of freedom
## Multiple R-squared: 0.5004, Adjusted R-squared: 0.4754
## F-statistic: 20.03 on 4 and 80 DF, p-value: 1.849e-11
anova(fit2, fit3)
## Analysis of Variance Table
##
## Model 1: Price ~ Age + Distance
## Model 2: Price ~ Age + Distance + Parking + Balcony
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 82 6720983
## 2 80 5991088 2 729894 4.8732 0.01007 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + Parking + Balcony, data = Apartments)
##
## Residuals:
## Min 1Q Median 3Q Max
## -459.92 -200.66 -57.48 260.08 594.37
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2301.667 94.271 24.415 < 2e-16 ***
## Age -6.799 3.110 -2.186 0.03172 *
## Distance -18.045 2.758 -6.543 5.28e-09 ***
## ParkingYes 196.168 62.868 3.120 0.00251 **
## BalconyYes 1.935 60.014 0.032 0.97436
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 273.7 on 80 degrees of freedom
## Multiple R-squared: 0.5004, Adjusted R-squared: 0.4754
## F-statistic: 20.03 on 4 and 80 DF, p-value: 1.849e-11
fitted_values <- fitted(fit3)
residuals <- resid(fit3)
fitted_value_ID2 <- fitted_values[2]
residual_ID2 <- residuals[2]
fitted_value_ID2
## 2
## 2357.411
residual_ID2
## 2
## 442.5889