library(readxl) #Activating library
# Read the apartments workbook and coerce the table returned by read_xlsx()
# into a plain data frame in one step, then preview the first rows.
mydata <- as.data.frame(read_xlsx("./Apartments.xlsx"))
head(mydata)
## Age Distance Price Parking Balcony
## 1 7 28 1640 0 1
## 2 18 1 2800 1 0
## 3 7 28 1660 0 0
## 4 28 29 1850 0 1
## 5 18 18 1640 1 1
## 6 28 12 1770 0 1
Description:
# Recode both 0/1 indicator columns as factors labelled "No"/"Yes" in a single
# pass, then preview the result.
mydata[c("Parking", "Balcony")] <- lapply(
  mydata[c("Parking", "Balcony")],
  factor,
  levels = c(0, 1),
  labels = c("No", "Yes")
)
head(mydata)
## Age Distance Price Parking Balcony
## 1 7 28 1640 No Yes
## 2 18 1 2800 Yes No
## 3 7 28 1660 No No
## 4 28 29 1850 No Yes
## 5 18 18 1640 Yes Yes
## 6 28 12 1770 No Yes
# One-sample, two-sided t-test of whether the mean of Price equals 1900.
t.test(mydata$Price,
mu = 1900,
alternative = "two.sided")
##
## One Sample t-test
##
## data: mydata$Price
## t = 2.9022, df = 84, p-value = 0.004731
## alternative hypothesis: true mean is not equal to 1900
## 95 percent confidence interval:
## 1937.443 2100.440
## sample estimates:
## mean of x
## 2018.941
Hypotheses for one-sample t-test:
Null hypothesis (H0): μ = 1900
Alternative hypothesis (H1): μ ≠ 1900
We reject the null hypothesis at the 5% significance level (p = 0.005 < 0.05), meaning that there is statistically significant evidence that the average apartment price differs from 1900 EUR.
# Simple linear regression of Price on Age; summary() prints the coefficient
# table, residual quantiles, R-squared and the overall F-test.
fit1 <- lm(Price ~ Age, data = mydata)
summary(fit1)
##
## Call:
## lm(formula = Price ~ Age, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -623.9 -278.0 -69.8 243.5 776.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2185.455 87.043 25.108 <2e-16 ***
## Age -8.975 4.164 -2.156 0.034 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 369.9 on 83 degrees of freedom
## Multiple R-squared: 0.05302, Adjusted R-squared: 0.04161
## F-statistic: 4.647 on 1 and 83 DF, p-value: 0.03401
The estimate of regression coefficient
Coefficient of determination
Coefficient of correlation
# Correlation between Price and Age (cor() with its default method).
with(mydata, cor(Price, Age))
## [1] -0.230255
# install.packages("car")
library(car)
## Loading required package: carData
# Pairwise scatterplots of the three numeric variables. The columns are chosen
# by name rather than by dropping positions 4 and 5, so the plot stays the same
# if this chunk is re-run after extra columns (e.g. std_resid, cooks_d) have
# been appended to mydata.
scatterplotMatrix(mydata[, c("Age", "Distance", "Price")],
                  smooth = FALSE,
                  main = "Scatterplot Matrix: Price, Age, and Distance")
# Multiple linear regression of Price on Age and Distance, fitted on the full
# data set; summary() prints the coefficient table and fit statistics.
fit2 <- lm(Price ~ Age + Distance, data = mydata)
summary(fit2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -603.23 -219.94 -85.68 211.31 689.58
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2460.101 76.632 32.10 < 2e-16 ***
## Age -7.934 3.225 -2.46 0.016 *
## Distance -20.667 2.748 -7.52 6.18e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 286.3 on 82 degrees of freedom
## Multiple R-squared: 0.4396, Adjusted R-squared: 0.4259
## F-statistic: 32.16 on 2 and 82 DF, p-value: 4.896e-11
# Variance inflation factor for each predictor in fit2 (multicollinearity
# check), followed by the mean of those factors.
vif(fit2)
## Age Distance
## 1.001845 1.001845
mean(vif(fit2))
## [1] 1.001845
# Store per-observation diagnostics from fit2 as columns of mydata:
# standardized residuals and Cook's distances.
mydata$std_resid <- rstandard(fit2)
mydata$cooks_d <- cooks.distance(fit2)
# Histogram of the standardized residuals.
hist(mydata$std_resid,
xlab = "Standardized residuals",
ylab = "Frequency",
main = "Histogram of standardized residuals")
# Shapiro-Wilk normality test on the standardized residuals.
shapiro.test(mydata$std_resid)
##
## Shapiro-Wilk normality test
##
## data: mydata$std_resid
## W = 0.95306, p-value = 0.00366
# Distribution of the Cook's distances stored in mydata.
hist(mydata$cooks_d,
     main = "Histogram of Cooks distances",
     xlab = "Cooks distance",
     ylab = "Frequency")
# The six units with the largest Cook's distance, most influential first.
head(mydata[order(-mydata$cooks_d), ], n = 6)
## Age Distance Price Parking Balcony std_resid cooks_d
## 38 5 45 2180 Yes Yes 2.576772 0.31973058
## 55 43 37 1740 No No 1.444768 0.10420445
## 33 2 11 2790 Yes No 2.050586 0.06913379
## 53 7 2 1760 No Yes -2.151787 0.06625775
## 22 37 3 2540 Yes Yes 1.575982 0.06086868
## 39 40 2 2400 No Yes 1.091176 0.03750987
# Sort the units by descending Cook's distance and drop the first row, i.e.
# the most influential unit. The remaining rows stay in that sorted order.
# The cooks_d column is not recomputed here; it still holds the values from
# the fit on the full data set.
mydata <- mydata[order(-mydata$cooks_d), ][-1, ]
# The six most influential units that remain.
head(mydata[order(-mydata$cooks_d), ], n = 6)
## Age Distance Price Parking Balcony std_resid cooks_d
## 55 43 37 1740 No No 1.444768 0.10420445
## 33 2 11 2790 Yes No 2.050586 0.06913379
## 53 7 2 1760 No Yes -2.151787 0.06625775
## 22 37 3 2540 Yes Yes 1.575982 0.06086868
## 39 40 2 2400 No Yes 1.091176 0.03750987
## 58 8 2 2820 Yes No 1.655343 0.03654413
# Cook's distances after removing the single most influential unit.
hist(mydata$cooks_d,
     main = "Histogram of Cooks distances",
     xlab = "Cooks distance",
     ylab = "Frequency")
# Sort by descending Cook's distance and drop the four largest remaining units.
mydata <- mydata[order(-mydata$cooks_d), ][-(1:4), ]
# Cook's distances of the units that are kept.
hist(mydata$cooks_d,
     main = "Histogram of Cooks distances",
     xlab = "Cooks distance",
     ylab = "Frequency")
# The six most influential units that remain.
head(mydata[order(-mydata$cooks_d), ], n = 6)
## Age Distance Price Parking Balcony std_resid cooks_d
## 39 40 2 2400 No Yes 1.091176 0.03750987
## 58 8 2 2820 Yes No 1.655343 0.03654413
## 25 8 26 2300 Yes Yes 1.570985 0.03412790
## 57 10 1 2810 No No 1.600668 0.03199717
## 2 18 1 2800 Yes No 1.783288 0.03036543
## 61 18 1 2800 Yes Yes 1.783288 0.03036543
# Refit the two-predictor model on the data that remain after the influential
# units were removed.
fit2 <- lm(Price ~ Age + Distance, data = mydata)

# Refresh the stored diagnostics so they describe the refitted model. Without
# this, std_resid and cooks_d still hold the values computed from the earlier
# fit on the full data set, and the plot below would pair old residuals with
# new fitted values.
mydata$std_resid <- rstandard(fit2)
mydata$cooks_d <- cooks.distance(fit2)

# Standardized fitted values. scale() returns a one-column matrix, so convert
# it to a plain numeric vector before storing it as a data frame column.
mydata$std_fit <- as.numeric(scale(fit2$fitted.values))

library(car)
# Standardized residuals against standardized fitted values: a visual check
# for heteroskedasticity and non-linearity in the refitted model.
scatterplot(std_resid ~ std_fit,
            data = mydata,
            xlab = "Standardized Fitted Values",
            ylab = "Standardized Residuals",
            main = "Scatterplot: Residuals vs Fitted Values",
            boxplots = FALSE,
            regLine = FALSE,
            smooth = FALSE,
            grid = FALSE)
# install.packages("ggpubr")
library(ggpubr)
## Loading required package: ggplot2
# Q-Q plot of the standardized residuals of the refitted fit2.
ggqqplot(rstandard(fit2))
# Shapiro-Wilk normality test on the same standardized residuals.
shapiro.test(rstandard(fit2))
##
## Shapiro-Wilk normality test
##
## data: rstandard(fit2)
## W = 0.94156, p-value = 0.001168
# Two-predictor model on the reduced data set, stored under its own name.
# The formula and data are the same as in the refit of fit2 after the
# influential units were removed.
fit2_clean <- lm(Price ~ Age + Distance, data = mydata)
summary(fit2_clean)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -411.50 -203.69 -45.24 191.11 492.56
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2502.467 75.024 33.356 < 2e-16 ***
## Age -8.674 3.221 -2.693 0.00869 **
## Distance -24.063 2.692 -8.939 1.57e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 256.8 on 77 degrees of freedom
## Multiple R-squared: 0.5361, Adjusted R-squared: 0.524
## F-statistic: 44.49 on 2 and 77 DF, p-value: 1.437e-13
Intercept (b0)
Age - Regression Coefficient (b1)
Distance - Regression Coefficient (b2)
Coefficient of determination
F-statistic
# Extend the model with the two categorical predictors Parking and Balcony;
# summary() reports their effects as ParkingYes and BalconyYes (reference
# level "No").
fit3 <- lm(Price ~ Age + Distance + Parking + Balcony, data = mydata)
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + Parking + Balcony, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -390.93 -198.19 -53.64 186.73 518.34
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2393.316 93.930 25.480 < 2e-16 ***
## Age -7.970 3.191 -2.498 0.0147 *
## Distance -21.961 2.830 -7.762 3.39e-11 ***
## ParkingYes 128.700 60.801 2.117 0.0376 *
## BalconyYes 6.032 57.307 0.105 0.9165
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 252.7 on 75 degrees of freedom
## Multiple R-squared: 0.5623, Adjusted R-squared: 0.5389
## F-statistic: 24.08 on 4 and 75 DF, p-value: 7.764e-13
# F-test comparing the nested models: does adding Parking and Balcony (fit3)
# reduce the residual sum of squares significantly relative to fit2?
anova(fit2, fit3)
## Analysis of Variance Table
##
## Model 1: Price ~ Age + Distance
## Model 2: Price ~ Age + Distance + Parking + Balcony
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 77 5077362
## 2 75 4791128 2 286234 2.2403 0.1135
# Print the fit3 summary again as the reference for the interpretation below.
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + Parking + Balcony, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -390.93 -198.19 -53.64 186.73 518.34
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2393.316 93.930 25.480 < 2e-16 ***
## Age -7.970 3.191 -2.498 0.0147 *
## Distance -21.961 2.830 -7.762 3.39e-11 ***
## ParkingYes 128.700 60.801 2.117 0.0376 *
## BalconyYes 6.032 57.307 0.105 0.9165
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 252.7 on 75 degrees of freedom
## Multiple R-squared: 0.5623, Adjusted R-squared: 0.5389
## F-statistic: 24.08 on 4 and 75 DF, p-value: 7.764e-13
Interpretation of Categorical Coefficients:
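ParkingYes (b3): Holding age, distance and balcony constant, apartments with parking are on average 128.70 EUR more expensive than apartments without parking (p = 0.038).
BalconyYes (b4): Holding age, distance and parking constant, apartments with a balcony are estimated to be 6.03 EUR more expensive than apartments without a balcony, but this difference is not statistically significant (p = 0.917).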
F-statistic Hypothesis Test
Null hypothesis (H0): All slope coefficients are zero (β1 = β2 = β3 = β4 = 0), i.e., the explanatory variables together explain none of the variation in the dependent variable
Alternative hypothesis (H1): At least one explanatory variable in the model significantly explains variation in the dependent variable
Since the p-value of the overall F-test (7.764e-13) is far below 0.05, we reject the null hypothesis: at least one explanatory variable in the model significantly explains variation in the dependent variable.