library(readxl)
# Load the apartments workbook and coerce the result to a plain data.frame.
# The workbook location can be overridden with the APARTMENTS_XLSX environment
# variable so the script is not tied to one user's desktop folder; when the
# variable is unset, the original absolute path is used unchanged.
apartments_path <- Sys.getenv(
  "APARTMENTS_XLSX",
  unset = "C:/Users/Korisnik/Desktop/domaci/Apartments.xlsx"
)
mydata4 <- as.data.frame(read_xlsx(apartments_path))
# Preview the first six rows
head(mydata4)
## Age Distance Price Parking Balcony
## 1 7 28 1640 0 1
## 2 18 1 2800 1 0
## 3 7 28 1660 0 0
## 4 28 29 1850 0 1
## 5 18 18 1640 1 1
## 6 28 12 1770 0 1
Description:
# Recode both 0/1 indicator columns as factors in a single pass
mydata4[c("Parking", "Balcony")] <- lapply(
  mydata4[c("Parking", "Balcony")],
  as.factor
)
# Confirm the column types after recoding
str(mydata4)
## 'data.frame': 85 obs. of 5 variables:
## $ Age : num 7 18 7 28 18 28 14 18 22 25 ...
## $ Distance: num 28 1 28 29 18 12 20 6 7 2 ...
## $ Price : num 1640 2800 1660 1850 1640 1770 1850 1970 2270 2570 ...
## $ Parking : Factor w/ 2 levels "0","1": 1 2 1 1 2 1 1 2 2 2 ...
## $ Balcony : Factor w/ 2 levels "0","1": 2 1 1 2 2 2 2 2 1 1 ...
# One-sample, two-sided t-test of H0: mean Price = 1900
t_test_result <- t.test(mydata4$Price, mu = 1900)
t_test_result
##
## One Sample t-test
##
## data: mydata4$Price
## t = 2.9022, df = 84, p-value = 0.004731
## alternative hypothesis: true mean is not equal to 1900
## 95 percent confidence interval:
## 1937.443 2100.440
## sample estimates:
## mean of x
## 2018.941
# p = 0.0047 < 0.05 and the 95% confidence interval excludes 1900, so H0 is
# rejected at the 5% level.
# Sample mean for reference; it matches the estimate reported by the t-test
mean(mydata4$Price)
## [1] 2018.941
# Simple linear regression of Price on Age
fit1 <- lm(Price ~ Age, data = mydata4)
summary(fit1)
##
## Call:
## lm(formula = Price ~ Age, data = mydata4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -623.9 -278.0 -69.8 243.5 776.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2185.455 87.043 25.108 <2e-16 ***
## Age -8.975 4.164 -2.156 0.034 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 369.9 on 83 degrees of freedom
## Multiple R-squared: 0.05302, Adjusted R-squared: 0.04161
## F-statistic: 4.647 on 1 and 83 DF, p-value: 0.03401
# The Age slope is -8.975 (p = 0.034): significant at the 5% level, but Age
# alone explains only about 5% of the variation in Price (R-squared 0.053).
# Pearson correlation between price and age
with(mydata4, cor(Price, Age))
## [1] -0.230255
# Pairwise scatterplots of the three numeric variables
pairs(
  ~ Price + Age + Distance,
  data = mydata4,
  main = "Scatterplot Matrix"
)
# Multiple regression of Price on Age and Distance
fit2 <- lm(Price ~ Age + Distance, data = mydata4)
summary(fit2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = mydata4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -603.23 -219.94 -85.68 211.31 689.58
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2460.101 76.632 32.10 < 2e-16 ***
## Age -7.934 3.225 -2.46 0.016 *
## Distance -20.667 2.748 -7.52 6.18e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 286.3 on 82 degrees of freedom
## Multiple R-squared: 0.4396, Adjusted R-squared: 0.4259
## F-statistic: 32.16 on 2 and 82 DF, p-value: 4.896e-11
# Both slopes are negative and significant at the 5% level; R-squared is 0.44.
# Attach car and compute variance inflation factors for the predictors of fit2
library(car)
## Loading required package: carData
vif(fit2)
## Age Distance
## 1.001845 1.001845
# Both VIFs are about 1.00, i.e. essentially no variance inflation from
# collinearity between Age and Distance.
# Influence diagnostics for fit2: Cook's distances and standardized residuals
cooks_d <- cooks.distance(fit2)
std_resid <- rstandard(fit2)
# Units whose standardized residual falls outside [-2, 2]
which(std_resid < -2 | std_resid > 2)
## 33 38 53
## 33 38 53
# Units whose Cook's distance exceeds the 4/n cut-off
which(cooks_d > 4 / length(cooks_d))
## 22 33 38 53 55
## 22 33 38 53 55
# Standardized residuals against standardized fitted values of fit2
std_fit <- scale(fitted(fit2))
plot(
  x = std_fit, y = std_resid,
  main = "Residuals vs Fitted",
  xlab = "Standardized Fitted Values",
  ylab = "Standardized Residuals"
)
# Zero reference line
abline(h = 0, col = "red")
# Distribution of the standardized residuals
hist(
  std_resid,
  xlab = "Std Residuals",
  main = "Histogram of Standardized Residuals"
)
The histogram suggests an approximately normal distribution of
residuals.
# Normal Q-Q plot of the standardized residuals with a red reference line
qqnorm(y = std_resid)
qqline(y = std_resid, col = "red")
The Q-Q plot also suggests an approximately normal distribution of
residuals.
# Shapiro-Wilk test of normality for the standardized residuals of fit2
shapiro.test(std_resid)
##
## Shapiro-Wilk normality test
##
## data: std_resid
## W = 0.95306, p-value = 0.00366
# p = 0.0037 < 0.05: the test rejects normality of the standardized residuals
# at the 5% level.
# Drop every unit flagged by either diagnostic (|standardized residual| > 2
# or Cook's distance > 4/n), then refit the two-predictor model
clean_data <- mydata4[
  !(abs(std_resid) > 2 | cooks_d > 4 / length(cooks_d)),
]
fit2_clean <- lm(Price ~ Age + Distance, data = clean_data)
summary(fit2_clean)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = clean_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -411.50 -203.69 -45.24 191.11 492.56
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2502.467 75.024 33.356 < 2e-16 ***
## Age -8.674 3.221 -2.693 0.00869 **
## Distance -24.063 2.692 -8.939 1.57e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 256.8 on 77 degrees of freedom
## Multiple R-squared: 0.5361, Adjusted R-squared: 0.524
## F-statistic: 44.49 on 2 and 77 DF, p-value: 1.437e-13
# Extended model on the full data: adds the Parking and Balcony factors to
# Age and Distance
fit3 <- lm(Price ~ Age + Distance + Parking + Balcony, data = mydata4)
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + Parking + Balcony, data = mydata4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -459.92 -200.66 -57.48 260.08 594.37
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2301.667 94.271 24.415 < 2e-16 ***
## Age -6.799 3.110 -2.186 0.03172 *
## Distance -18.045 2.758 -6.543 5.28e-09 ***
## Parking1 196.168 62.868 3.120 0.00251 **
## Balcony1 1.935 60.014 0.032 0.97436
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 273.7 on 80 degrees of freedom
## Multiple R-squared: 0.5004, Adjusted R-squared: 0.4754
## F-statistic: 20.03 on 4 and 80 DF, p-value: 1.849e-11
# Parking1 and Balcony1 are the dummies for factor level "1" (reference level
# "0"). Parking1 is significant (p = 0.0025); Balcony1 is not (p = 0.974).
R² ≈ 0.50: the extended model explains slightly more variation in prices than fit2 (R² ≈ 0.44).
# F-test comparing the nested models fit2 and fit3
anova(fit2, fit3)
## Analysis of Variance Table
##
## Model 1: Price ~ Age + Distance
## Model 2: Price ~ Age + Distance + Parking + Balcony
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 82 6720983
## 2 80 5991088 2 729894 4.8732 0.01007 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# F = 4.87 on 2 extra terms, p = 0.010: adding Parking and Balcony jointly
# improves the fit at the 5% level.
# Summary of fit3 printed a second time; the output is identical to the
# earlier summary(fit3) call
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + Parking + Balcony, data = mydata4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -459.92 -200.66 -57.48 260.08 594.37
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2301.667 94.271 24.415 < 2e-16 ***
## Age -6.799 3.110 -2.186 0.03172 *
## Distance -18.045 2.758 -6.543 5.28e-09 ***
## Parking1 196.168 62.868 3.120 0.00251 **
## Balcony1 1.935 60.014 0.032 0.97436
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 273.7 on 80 degrees of freedom
## Multiple R-squared: 0.5004, Adjusted R-squared: 0.4754
## F-statistic: 20.03 on 4 and 80 DF, p-value: 1.849e-11
# Fitted price and residual of the second apartment under fit3
residuals_fit3 <- resid(fit3)
fitted_values <- fitted.values(fit3)
fitted_values[2]
## 2
## 2357.411
residuals_fit3[2]
## 2
## 442.5889