library(readxl)
mydata <- read_xlsx("./Apartments.xlsx")
mydata <- as.data.frame(mydata)
Description:
Do the age of an apartment in years, the distance of the apartment from the city center in km, whether the apartment has parking and whether the apartment has a balcony influence the price of the apartment?
mydata$ParkingF <- factor(mydata$Parking,
levels = c("0", "1"),
labels = c("No", "Yes"))
mydata$BalconyF <- factor(mydata$Balcony,
levels = c("0", "1"),
labels = c("No", "Yes"))
library(ggplot2)
ggplot(mydata, aes(x = Price)) +
geom_histogram(binwidth = 80, colour = "black") +
ylab("Frequency") +
xlab("Price per m2 of the Apartments in EUR")
shapiro.test(mydata$Price)
##
## Shapiro-Wilk normality test
##
## data: mydata$Price
## W = 0.94017, p-value = 0.0006513
wilcox.test(mydata$Price,
mu = 1900,
correct = FALSE)
##
## Wilcoxon signed rank test
##
## data: mydata$Price
## V = 2328, p-value = 0.02828
## alternative hypothesis: true location is not equal to 1900
library(effectsize)
effectsize(wilcox.test(mydata$Price,
mu = 1900,
correct = FALSE))
## r (rank biserial) | 95% CI
## --------------------------------
## 0.27 | [0.04, 0.48]
##
## - Deviation from a difference of 1900.
interpret_rank_biserial(0.27, "funder2019")
## [1] "medium"
## (Rules: funder2019)
median(mydata$Price)
## [1] 1950
fit1 <- lm(Price ~ Age,
data = mydata)
summary(fit1)
##
## Call:
## lm(formula = Price ~ Age, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -623.9 -278.0 -69.8 243.5 776.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2185.455 87.043 25.108 <2e-16 ***
## Age -8.975 4.164 -2.156 0.034 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 369.9 on 83 degrees of freedom
## Multiple R-squared: 0.05302, Adjusted R-squared: 0.04161
## F-statistic: 4.647 on 1 and 83 DF, p-value: 0.03401
cor(mydata$Price, mydata$Age)
## [1] -0.230255
Regression Coefficient: If the age of an apartment increases by one year, then the price per m2 of the apartment is predicted to decrease by 8.976 EUR on average.
Coefficient of Correlation: The Coefficient of Correlation of -0.230255 between the price per m2 of an apartment and the age of the apartment indicates a weak negative linear relationship, suggesting that as the age of an apartment increases, the price per m2 of the apartment tends to slightly decrease.
Coefficient of Determination: 5.302% of the variability in the price per m2 of the apartments can be explained by the linear effects of the age of the apartments.
library(car)
## 载入需要的程序包:carData
scatterplotMatrix(mydata[, c(-4, -5, -6, -7)],
smooth = FALSE)
fit2 <- lm(Price ~ Age + Distance,
data = mydata)
summary(fit2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -603.23 -219.94 -85.68 211.31 689.58
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2460.101 76.632 32.10 < 2e-16 ***
## Age -7.934 3.225 -2.46 0.016 *
## Distance -20.667 2.748 -7.52 6.18e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 286.3 on 82 degrees of freedom
## Multiple R-squared: 0.4396, Adjusted R-squared: 0.4259
## F-statistic: 32.16 on 2 and 82 DF, p-value: 4.896e-11
vif(fit2)
## Age Distance
## 1.001845 1.001845
mydata$StdResid <- round(rstandard(fit2), 3)
mydata$CooksD <- round(cooks.distance(fit2), 3)
head(mydata[order(mydata$StdResid), ], 3)
## Age Distance Price Parking Balcony ParkingF BalconyF StdResid CooksD
## 53 7 2 1760 0 1 No Yes -2.152 0.066
## 13 12 14 1650 0 1 No Yes -1.499 0.013
## 72 12 14 1650 0 0 No No -1.499 0.013
head(mydata[order(-mydata$CooksD), ], 6)
## Age Distance Price Parking Balcony ParkingF BalconyF StdResid CooksD
## 38 5 45 2180 1 1 Yes Yes 2.577 0.320
## 55 43 37 1740 0 0 No No 1.445 0.104
## 33 2 11 2790 1 0 Yes No 2.051 0.069
## 53 7 2 1760 0 1 No Yes -2.152 0.066
## 22 37 3 2540 1 1 Yes Yes 1.576 0.061
## 39 40 2 2400 0 1 No Yes 1.091 0.038
hist(mydata$CooksD,
xlab = "Cook's Distance",
ylab = "Frequency",
main = "Histogram of Cook's Distances")
library(dplyr)
##
## 载入程序包:'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
mydata2 <- mydata %>%
filter(!CooksD > 0.3)
fit2_new <- lm(Price ~ Age + Distance,
data = mydata2)
summary(fit2_new)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = mydata2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -604.92 -229.63 -56.49 192.97 599.35
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2456.076 73.931 33.221 < 2e-16 ***
## Age -6.464 3.159 -2.046 0.044 *
## Distance -22.955 2.786 -8.240 2.52e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 276.1 on 81 degrees of freedom
## Multiple R-squared: 0.4838, Adjusted R-squared: 0.4711
## F-statistic: 37.96 on 2 and 81 DF, p-value: 2.339e-12
library(car)
mydata2$StdFitted <- scale(fit2_new$fitted.values)
mydata2$StdResid <- round(rstandard(fit2_new), 3)
mydata2$CooksD <- round(cooks.distance(fit2_new), 3)
scatterplot(y = mydata2$StdResid, x = mydata2$StdFitted,
ylab = "Standardised Residuals",
xlab = "Standardised Fitted Values",
boxplots = FALSE,
regLine = FALSE,
smooth = FALSE)
library(olsrr)
##
## 载入程序包:'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
ols_test_breusch_pagan(fit2_new)
##
## Breusch Pagan Test for Heteroskedasticity
## -----------------------------------------
## Ho: the variance is constant
## Ha: the variance is not constant
##
## Data
## ---------------------------------
## Response : Price
## Variables: fitted values of Price
##
## Test Summary
## -----------------------------
## DF = 1
## Chi2 = 2.927455
## Prob > Chi2 = 0.08708469
hist(mydata2$StdResid,
xlab = "Standardised Residuals",
ylab = "Frequency",
main = "Histogram of Standardised Residuals")
shapiro.test(mydata2$StdResid)
##
## Shapiro-Wilk normality test
##
## data: mydata2$StdResid
## W = 0.95649, p-value = 0.006355
fit2_new <- lm(Price ~ Age + Distance,
data = mydata2)
summary(fit2_new)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = mydata2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -604.92 -229.63 -56.49 192.97 599.35
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2456.076 73.931 33.221 < 2e-16 ***
## Age -6.464 3.159 -2.046 0.044 *
## Distance -22.955 2.786 -8.240 2.52e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 276.1 on 81 degrees of freedom
## Multiple R-squared: 0.4838, Adjusted R-squared: 0.4711
## F-statistic: 37.96 on 2 and 81 DF, p-value: 2.339e-12
Age: Holding all other explanatory variables constant, if the age of an apartment increases by one year, then the price per m2 of the apartment is predicted to decrease by 6.464 EUR on average (p < 0.05).
Distance: Holding all other explanatory variables constant, if the distance of an apartment from city center increases by 1km, then the price per m2 of the apartment is predicted to decrease by 22.955 EUR on average (p < 0.01).
Coefficient of Determination: 48.38% of the variability in the price per m2 of the apartments can be explained by the linear effects of the age of the apartments and the distance of the apartment from city center.
fit3 <- lm(Price ~ Age + Distance + ParkingF + BalconyF,
data = mydata2)
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + ParkingF + BalconyF, data = mydata2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -473.21 -192.37 -28.89 204.17 558.77
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2329.724 93.066 25.033 < 2e-16 ***
## Age -5.821 3.074 -1.894 0.06190 .
## Distance -20.279 2.886 -7.026 6.66e-10 ***
## ParkingFYes 167.531 62.864 2.665 0.00933 **
## BalconyFYes -15.207 59.201 -0.257 0.79795
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 267.5 on 79 degrees of freedom
## Multiple R-squared: 0.5275, Adjusted R-squared: 0.5035
## F-statistic: 22.04 on 4 and 79 DF, p-value: 3.018e-12
anova(fit2_new, fit3)
## Analysis of Variance Table
##
## Model 1: Price ~ Age + Distance
## Model 2: Price ~ Age + Distance + ParkingF + BalconyF
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 81 6176767
## 2 79 5654480 2 522287 3.6485 0.03051 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + ParkingF + BalconyF, data = mydata2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -473.21 -192.37 -28.89 204.17 558.77
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2329.724 93.066 25.033 < 2e-16 ***
## Age -5.821 3.074 -1.894 0.06190 .
## Distance -20.279 2.886 -7.026 6.66e-10 ***
## ParkingFYes 167.531 62.864 2.665 0.00933 **
## BalconyFYes -15.207 59.201 -0.257 0.79795
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 267.5 on 79 degrees of freedom
## Multiple R-squared: 0.5275, Adjusted R-squared: 0.5035
## F-statistic: 22.04 on 4 and 79 DF, p-value: 3.018e-12
ParkingF: Holding all other explanatory variables constant, apartments that have parking are predicted to have a price per m2 that is higher by 167.531 EUR on average than apartments that do not have parking (p < 0.01).
BalconyF: Holding all other explanatory variables constant, apartments that have a balcony are predicted to have a price per m2 that is lower by 15.207 EUR on average than apartments that do not have a balcony.
The F-statistic at the bottom of the regression output is testing the overall significance of the regression model.
Null Hypothesis: All regression coefficients (except the intercept) are equal to zero.
Alternative Hypothesis: At least one of the coefficients is not equal to zero.
There is strong evidence that at least one of the explanatory variables contributes significantly to predicting Price. So the model is statistically significant overall.
mydata2$Fitted <- fitted.values(fit3)
mydata2$Residuals <- residuals(fit3)
head(mydata2[colnames(mydata2) %in% c("Price", "Fitted", "Residuals")])
## Price Fitted Residuals
## 1 1640 1705.952 -65.95206
## 2 2800 2372.197 427.80292
## 3 1660 1721.159 -61.15894
## 4 1850 1563.431 286.56890
## 5 1640 2012.244 -372.24396
## 6 1770 1908.177 -138.17733