library(readxl)
apartments <- read_excel("Apartments.xlsx")
head(apartments)
## # A tibble: 6 × 5
## Age Distance Price Parking Balcony
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7 28 1640 0 1
## 2 18 1 2800 1 0
## 3 7 28 1660 0 0
## 4 28 29 1850 0 1
## 5 18 18 1640 1 1
## 6 28 12 1770 0 1
Description:
apartments$ParkingF <- factor(apartments$Parking,
levels = c(0, 1),
labels = c("No", "Yes"))
apartments$BalconyF <- factor(apartments$Balcony,
levels = c(0, 1),
labels = c("No", "Yes"))
t.test(apartments$Price, mu = 1900)
##
## One Sample t-test
##
## data: apartments$Price
## t = 2.9022, df = 84, p-value = 0.004731
## alternative hypothesis: true mean is not equal to 1900
## 95 percent confidence interval:
## 1937.443 2100.440
## sample estimates:
## mean of x
## 2018.941
fit1 <- lm(Price ~ Age, data = apartments)
summary(fit1)
##
## Call:
## lm(formula = Price ~ Age, data = apartments)
##
## Residuals:
## Min 1Q Median 3Q Max
## -623.9 -278.0 -69.8 243.5 776.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2185.455 87.043 25.108 <2e-16 ***
## Age -8.975 4.164 -2.156 0.034 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 369.9 on 83 degrees of freedom
## Multiple R-squared: 0.05302, Adjusted R-squared: 0.04161
## F-statistic: 4.647 on 1 and 83 DF, p-value: 0.03401
library(car)
## Loading required package: carData
scatterplotMatrix(apartments[c("Price", "Age", "Distance")], smooth = FALSE)
# - The relationship between Age and Distance, as well as Age and price
appears positve and weak # - The relationship between Distance appears
negative and realtivley weak # - Based on the percived weakness of all
relationships, problems with multicolinearity appear unlikley
fit2 <- lm(Price ~ Age + Distance, data = apartments)
summary(fit2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = apartments)
##
## Residuals:
## Min 1Q Median 3Q Max
## -603.23 -219.94 -85.68 211.31 689.58
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2460.101 76.632 32.10 < 2e-16 ***
## Age -7.934 3.225 -2.46 0.016 *
## Distance -20.667 2.748 -7.52 6.18e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 286.3 on 82 degrees of freedom
## Multiple R-squared: 0.4396, Adjusted R-squared: 0.4259
## F-statistic: 32.16 on 2 and 82 DF, p-value: 4.896e-11
library(car)
vif(fit2)
## Age Distance
## 1.001845 1.001845
print(mean(vif(fit2)))
## [1] 1.001845
apartments$StdResid <- round(rstandard(fit2), 3)
apartments$CooksD <- round(cooks.distance(fit2),3)
apartments$StdResid <- round(rstandard(fit2), 3)
apartments$CooksD <- round(cooks.distance(fit2),3)
hist(apartments$StdResid,
xlab = "Standardized residuals",
ylab = "Frequency",
main = "Histogram of standardized residuals")
hist(apartments$CooksD,
xlab = "Cooks distance",
ylab = "Frequency",
main = "Histogram of Cooks distances")
head(apartments[order(apartments$StdResid),], 3)
## # A tibble: 3 × 9
## Age Distance Price Parking Balcony ParkingF BalconyF StdResid CooksD
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <fct> <dbl> <dbl>
## 1 7 2 1760 0 1 No Yes -2.15 0.066
## 2 12 14 1650 0 1 No Yes -1.50 0.013
## 3 12 14 1650 0 0 No No -1.50 0.013
head(apartments[order(-apartments$StdResid),], 3)
## # A tibble: 3 × 9
## Age Distance Price Parking Balcony ParkingF BalconyF StdResid CooksD
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <fct> <dbl> <dbl>
## 1 5 45 2180 1 1 Yes Yes 2.58 0.32
## 2 2 11 2790 1 0 Yes No 2.05 0.069
## 3 18 1 2800 1 0 Yes No 1.78 0.03
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
apartments <- apartments %>%
filter (!StdResid %in% c(-2.152, 2.577))
head(apartments[order(-apartments$CooksD),], 6)
## # A tibble: 6 × 9
## Age Distance Price Parking Balcony ParkingF BalconyF StdResid CooksD
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <fct> <dbl> <dbl>
## 1 43 37 1740 0 0 No No 1.44 0.104
## 2 2 11 2790 1 0 Yes No 2.05 0.069
## 3 37 3 2540 1 1 Yes Yes 1.58 0.061
## 4 40 2 2400 0 1 No Yes 1.09 0.038
## 5 8 2 2820 1 0 Yes No 1.66 0.037
## 6 8 26 2300 1 1 Yes Yes 1.57 0.034
library(dplyr)
apartments <- apartments %>%
filter (!CooksD %in% c(0.104))
nrow(apartments)
## [1] 82
fit2 <- lm(Price ~ Age + Distance, data = apartments)
apartments$StdFitted <- scale(fit2$fitted.values)
library(car)
scatterplot(y = apartments$StdResid, x= apartments$StdFitted,
ylab= "Standarized Residuals",
xlab= "Standarized Fitted Values",
boxplots = FALSE,
regLine= FALSE,
smooth = FALSE)
library(olsrr)
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
ols_test_breusch_pagan(fit2)
##
## Breusch Pagan Test for Heteroskedasticity
## -----------------------------------------
## Ho: the variance is constant
## Ha: the variance is not constant
##
## Data
## ---------------------------------
## Response : Price
## Variables: fitted values of Price
##
## Test Summary
## ----------------------------
## DF = 1
## Chi2 = 1.814365
## Prob > Chi2 = 0.1779855
hist(apartments$StdResid,
xlab = "Standardised residuals",
ylab = "Frequency",
main = "Frequency of standardised residuals")
shapiro.test(apartments$StdResid)
##
## Shapiro-Wilk normality test
##
## data: apartments$StdResid
## W = 0.93423, p-value = 0.0004015
fit2 <- lm(Price ~ Age + Distance, data = apartments)
summary(fit2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = apartments)
##
## Residuals:
## Min 1Q Median 3Q Max
## -424.84 -215.45 -44.84 213.26 552.21
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2527.549 75.591 33.437 < 2e-16 ***
## Age -8.876 3.179 -2.792 0.00657 **
## Distance -24.728 2.763 -8.949 1.23e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 265.5 on 79 degrees of freedom
## Multiple R-squared: 0.5287, Adjusted R-squared: 0.5168
## F-statistic: 44.31 on 2 and 79 DF, p-value: 1.245e-13
fit3 <- lm(Price ~ Age + Distance + ParkingF + BalconyF, data = apartments)
anova(fit3, fit2)
## Analysis of Variance Table
##
## Model 1: Price ~ Age + Distance + ParkingF + BalconyF
## Model 2: Price ~ Age + Distance
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 77 5186289
## 2 79 5570638 -2 -384349 2.8532 0.06377 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + ParkingF + BalconyF, data = apartments)
##
## Residuals:
## Min 1Q Median 3Q Max
## -412.48 -200.10 -56.06 232.11 512.71
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2400.760 94.216 25.482 < 2e-16 ***
## Age -8.125 3.123 -2.602 0.0111 *
## Distance -22.221 2.897 -7.669 4.4e-11 ***
## ParkingFYes 147.825 61.883 2.389 0.0194 *
## BalconyFYes 6.400 58.014 0.110 0.9124
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 259.5 on 77 degrees of freedom
## Multiple R-squared: 0.5612, Adjusted R-squared: 0.5384
## F-statistic: 24.62 on 4 and 77 DF, p-value: 3.81e-13
apartments$fitted_values <- fitted(fit3)
apartments$residuals <- apartments$Price - apartments$fitted_values
apartments[2, c("fitted_values", "residuals")]
## # A tibble: 1 × 2
## fitted_values residuals
## <dbl> <dbl>
## 1 2380. 420.