library(readxl)
# Read the apartments workbook, coerce the result to a plain data.frame,
# and preview the first rows.
mydata <- as.data.frame(read_xlsx("./Apartments.xlsx"))
head(mydata)
## Age Distance Price Parking Balcony
## 1 7 28 1640 0 1
## 2 18 1 2800 1 0
## 3 7 28 1660 0 0
## 4 28 29 1850 0 1
## 5 18 18 1640 1 1
## 6 28 12 1770 0 1
Description:
Does the age of an apartment in years affect its price?
# Recode the 0/1 indicator columns as labelled factors; level 0 becomes the
# first (reference) level of each factor.
mydata[["ParkingF"]] <- factor(
  mydata[["Parking"]],
  levels = c(0, 1),
  labels = c("No Parking", "Parking")
)
mydata[["BalconyF"]] <- factor(
  mydata[["Balcony"]],
  levels = c(0, 1),
  labels = c("No Balcony", "Balcony")
)
library(psych)
# Descriptive statistics (n, mean, sd, median, skew, ...) for Price.
describe(mydata[["Price"]])
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 85 2018.94 377.84 1950 1990.29 429.95 1400 2820 1420 0.54 -0.69 40.98
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
# Histogram of apartment prices in 250-wide bins.
# The x-axis carries the price (in EUR); the y-axis is the number of
# apartments falling into each bin, so it is labelled as a frequency rather
# than with the currency.
ggplot(mydata, aes(x = Price)) +
  geom_histogram(binwidth = 250, colour = "black") +
  xlab("Price of apartments (EUR)") +
  ylab("Frequency")
# Shapiro-Wilk normality test on Price; the hypotheses are stated below the
# printed result.
shapiro.test(mydata$Price)
##
## Shapiro-Wilk normality test
##
## data: mydata$Price
## W = 0.94017, p-value = 0.0006513
H0: Price of apartments is normally distributed
H1: Price of apartments is not normally distributed
# Sample median of Price.
median(mydata[["Price"]])
## [1] 1950
# One-sample, two-sided Wilcoxon signed rank test of Price against a
# hypothesised location of 1900.
# correct = FALSE: per the wilcox.test documentation, this switches off the
# continuity correction in the normal approximation for the p-value.
wilcox.test(mydata$Price,
mu = 1900,
correct = FALSE)
##
## Wilcoxon signed rank test
##
## data: mydata$Price
## V = 2328, p-value = 0.02828
## alternative hypothesis: true location is not equal to 1900
H0: Median of apartment price is equal to 1900 euro
H1: Median of apartment price is not equal to 1900 euro
library(psych)
library(effectsize)
##
## Attaching package: 'effectsize'
## The following object is masked from 'package:psych':
##
## phi
# Effect size for the Wilcoxon signed rank test of Price against 1900; the
# printed result is a rank-biserial correlation with a 95% CI.
# The test is called again inline with the same arguments as the test above.
effectsize(wilcox.test(mydata$Price,
mu = 1900,
correct = FALSE))
## r (rank biserial) | 95% CI
## --------------------------------
## 0.27 | [0.04, 0.48]
##
## - Deviation from a difference of 1900.
Based on the sample data, we found that the median price of apartments is not equal to 1900 euro (p = 0.03, r = 0.27 - medium effect). The median was 1950 euro.
# Simple linear regression: Price explained by Age alone.
fit1 <- lm(formula = Price ~ Age, data = mydata)
summary(fit1)
##
## Call:
## lm(formula = Price ~ Age, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -623.9 -278.0 -69.8 243.5 776.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2185.455 87.043 25.108 <2e-16 ***
## Age -8.975 4.164 -2.156 0.034 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 369.9 on 83 degrees of freedom
## Multiple R-squared: 0.05302, Adjusted R-squared: 0.04161
## F-statistic: 4.647 on 1 and 83 DF, p-value: 0.03401
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
# Pairwise scatterplots of the response and the two numeric predictors,
# drawn without smoothers.
scatterplotMatrix(mydata[, c("Price", "Age", "Distance")], smooth = FALSE)

# Multiple linear regression: Price explained by Age and Distance.
fit2 <- lm(formula = Price ~ Age + Distance, data = mydata)
summary(fit2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -603.23 -219.94 -85.68 211.31 689.58
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2460.101 76.632 32.10 < 2e-16 ***
## Age -7.934 3.225 -2.46 0.016 *
## Distance -20.667 2.748 -7.52 6.18e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 286.3 on 82 degrees of freedom
## Multiple R-squared: 0.4396, Adjusted R-squared: 0.4259
## F-statistic: 32.16 on 2 and 82 DF, p-value: 4.896e-11
library(car)
# Variance inflation factors for the predictors of fit2.
car::vif(fit2)
## Age Distance
## 1.001845 1.001845
# Store per-observation diagnostics from fit2, rounded to three decimals.
mydata$StdResid <- round(rstandard(fit2), digits = 3)
mydata$CooksD <- round(cooks.distance(fit2), digits = 3)

# Distribution of the standardized residuals.
hist(
  mydata$StdResid,
  main = "Histogram of standardized residuals",
  xlab = "Standardized residuals",
  ylab = "Frequency"
)
# Shapiro-Wilk normality test on the (rounded) standardized residuals of fit2.
shapiro.test(mydata$StdResid)
##
## Shapiro-Wilk normality test
##
## data: mydata$StdResid
## W = 0.95303, p-value = 0.003645
H0: Errors are normally distributed. H1: Errors are not normally distributed.
# Distribution of the Cook's distances stored in mydata$CooksD.
hist(mydata$CooksD,
     xlab = "Cooks distance",
     ylab = "Frequency",
     main = "Histogram of Cooks distances")

# Row identifier used to report and later remove individual observations.
# seq_len() is used instead of seq(1, n) so that an empty data frame yields
# an empty ID column rather than c(1, 0).
mydata$ID <- seq_len(nrow(mydata))

# Three smallest (most negative) standardized residuals.
head(mydata[order(mydata$StdResid), c("ID", "StdResid")], 3)
## ID StdResid
## 53 53 -2.152
## 13 13 -1.499
## 72 72 -1.499
# Three largest (most positive) standardized residuals.
head(mydata[order(-mydata[["StdResid"]]), c("ID", "StdResid")], n = 3)
## ID StdResid
## 38 38 2.577
## 33 33 2.051
## 2 2 1.783
# Six observations with the largest Cook's distances.
head(mydata[order(-mydata[["CooksD"]]), c("ID", "CooksD")], n = 6)
## ID CooksD
## 38 38 0.320
## 55 55 0.104
## 33 33 0.069
## 53 53 0.066
## 22 22 0.061
## 39 39 0.038
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(car)
# Drop the two observations with the largest Cook's distances (IDs 38 and 55)
# and refit the model on the remaining rows.
mydata <- mydata %>%
  filter(!ID %in% c(38, 55))

fit2 <- lm(Price ~ Age + Distance,
           data = mydata)

# Recompute the diagnostics from the refitted model. Without this step the
# StdResid and CooksD columns still hold values from the earlier fit that
# included the removed rows, so the plots and the normality test below would
# pair old residuals with new fitted values.
mydata$StdResid <- round(rstandard(fit2), 3)
mydata$CooksD <- round(cooks.distance(fit2), 3)

# scale() returns a one-column matrix; as.numeric() stores a plain numeric
# vector in the data frame instead.
mydata$StdFitted <- as.numeric(scale(fit2$fitted.values))

library(car)
# Standardized residuals against standardized fitted values, without
# marginal boxplots, regression line or smoother.
scatterplot(y = mydata$StdResid, x = mydata$StdFitted,
            ylab = "Standardized residuals",
            xlab = "Standardized fitted values",
            boxplots = FALSE,
            regLine = FALSE,
            smooth = FALSE)

# Distribution and Shapiro-Wilk normality test of the refitted model's
# standardized residuals.
hist(mydata$StdResid,
     xlab = "Standardized residuals",
     ylab = "Frequency",
     main = "Histogram of standardized residuals")
shapiro.test(mydata$StdResid)
##
## Shapiro-Wilk normality test
##
## data: mydata$StdResid
## W = 0.94963, p-value = 0.002636
H0: Errors are normally distributed. H1: Errors are not normally distributed.
# Fit Price on Age and Distance using the current contents of mydata and
# print the coefficient table.
fit2 <- lm(formula = Price ~ Age + Distance, data = mydata)
summary(fit2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -627.27 -212.96 -46.23 205.05 578.98
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2490.112 76.189 32.684 < 2e-16 ***
## Age -7.850 3.244 -2.420 0.0178 *
## Distance -23.945 2.826 -8.473 9.53e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 273.5 on 80 degrees of freedom
## Multiple R-squared: 0.4968, Adjusted R-squared: 0.4842
## F-statistic: 39.49 on 2 and 80 DF, p-value: 1.173e-12
# Extended model: adds the parking and balcony factors to Age and Distance.
fit3 <- lm(
  formula = Price ~ Age + Distance + ParkingF + BalconyF,
  data = mydata
)
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + ParkingF + BalconyF, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -499.06 -194.33 -32.04 219.03 544.31
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2358.900 93.664 25.185 < 2e-16 ***
## Age -7.197 3.148 -2.286 0.02499 *
## Distance -21.241 2.911 -7.296 2.14e-10 ***
## ParkingFParking 168.921 62.166 2.717 0.00811 **
## BalconyFBalcony -6.985 58.745 -0.119 0.90566
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 264.5 on 78 degrees of freedom
## Multiple R-squared: 0.5408, Adjusted R-squared: 0.5173
## F-statistic: 22.97 on 4 and 78 DF, p-value: 1.449e-12
# Prints the summary of fit3 again (same output as the previous call).
# NOTE(review): if a comparison between fit2 and fit3 was intended here,
# anova(fit2, fit3) would provide it - confirm with the author.
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + ParkingF + BalconyF, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -499.06 -194.33 -32.04 219.03 544.31
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2358.900 93.664 25.185 < 2e-16 ***
## Age -7.197 3.148 -2.286 0.02499 *
## Distance -21.241 2.911 -7.296 2.14e-10 ***
## ParkingFParking 168.921 62.166 2.717 0.00811 **
## BalconyFBalcony -6.985 58.745 -0.119 0.90566
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 264.5 on 78 degrees of freedom
## Multiple R-squared: 0.5408, Adjusted R-squared: 0.5173
## F-statistic: 22.97 on 4 and 78 DF, p-value: 1.449e-12