library(readxl)
# Load the apartment listings from the Excel sheet and prepend a sequential
# row identifier as the first column.
# seq_len() is used instead of 1:nrow() so that a sheet with zero rows yields
# an empty ID vector rather than the two-element sequence c(1, 0).
mydata <- read_excel("./Apartments.xlsx")
mydata <- cbind(ID = seq_len(nrow(mydata)), mydata)
head(mydata)
## ID Age Distance Price Parking Balcony
## 1 1 7 28 1640 0 1
## 2 2 18 1 2800 1 0
## 3 3 7 28 1660 0 0
## 4 4 28 29 1850 0 1
## 5 5 18 18 1640 1 1
## 6 6 28 12 1770 0 1
Description:
# Recode the 0/1 indicator columns as labelled factors ("No" / "Yes") so they
# can enter regression models as categorical predictors.
mydata[["ParkingF"]] <- factor(
  mydata[["Parking"]],
  levels = c("0", "1"),
  labels = c("No", "Yes")
)
mydata[["BalconyF"]] <- factor(
  mydata[["Balcony"]],
  levels = c("0", "1"),
  labels = c("No", "Yes")
)
# One-sample, two-sided t-test of the null hypothesis that mean Price is 1900.
t.test(x = mydata$Price, alternative = "two.sided", mu = 1900)
##
## One Sample t-test
##
## data: mydata$Price
## t = 2.9022, df = 84, p-value = 0.004731
## alternative hypothesis: true mean is not equal to 1900
## 95 percent confidence interval:
## 1937.443 2100.440
## sample estimates:
## mean of x
## 2018.941
# Simple linear regression: Price explained by Age alone.
fit1 <- lm(formula = Price ~ Age, data = mydata)
summary(fit1)
##
## Call:
## lm(formula = Price ~ Age, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -623.9 -278.0 -69.8 243.5 776.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2185.455 87.043 25.108 <2e-16 ***
## Age -8.975 4.164 -2.156 0.034 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 369.9 on 83 degrees of freedom
## Multiple R-squared: 0.05302, Adjusted R-squared: 0.04161
## F-statistic: 4.647 on 1 and 83 DF, p-value: 0.03401
library(car)
## 필요한 패키지를 로딩중입니다: carData
# Pairwise scatterplots of the response and the two numeric predictors.
# Columns are selected by name rather than by position (previously 4, 2, 3)
# so the plot does not silently change if the column order of mydata changes,
# e.g. if the prepended ID column is dropped.
scatterplotMatrix(mydata[c("Price", "Age", "Distance")],
                  smooth = FALSE)
# Multiple regression of Price on Age and Distance, followed by variance
# inflation factors (per predictor, then their mean) as a multicollinearity
# check.
fit2 <- lm(formula = Price ~ Age + Distance, data = mydata)
vif(fit2)
## Age Distance
## 1.001845 1.001845
mean(vif(fit2))
## [1] 1.001845
# Attach per-observation diagnostics from fit2 (standardized residuals and
# Cook's distances), each rounded to three decimals.
mydata[["StdResid"]] <- round(rstandard(fit2), digits = 3)
mydata[["CooksD"]] <- round(cooks.distance(fit2), digits = 3)

# Distribution of Cook's distances across all observations.
hist(
  mydata[["CooksD"]],
  main = "Histogram of Cooks distance",
  xlab = "Cooks Distance",
  ylab = "Frequency",
  breaks = 20
)

# List the six observations with the largest Cook's distance.
head(mydata[order(-mydata[["CooksD"]]), ], n = 6)
## ID Age Distance Price Parking Balcony ParkingF BalconyF StdResid CooksD
## 38 38 5 45 2180 1 1 Yes Yes 2.577 0.320
## 55 55 43 37 1740 0 0 No No 1.445 0.104
## 33 33 2 11 2790 1 0 Yes No 2.051 0.069
## 53 53 7 2 1760 0 1 No Yes -2.152 0.066
## 22 22 37 3 2540 1 1 Yes Yes 1.576 0.061
## 39 39 40 2 2400 0 1 No Yes 1.091 0.038
# Keep only the observations whose (rounded) Cook's distance is below 0.04,
# then re-draw the histogram for the retained rows.
mydataC <- mydata[mydata[["CooksD"]] < 0.04, ]
hist(
  mydataC[["CooksD"]],
  main = "Histogram of Cooks distance",
  xlab = "Cooks Distance",
  ylab = "Frequency"
)
# Re-estimate the Price ~ Age + Distance model on the trimmed sample.
fit2 <- lm(Price ~ Age + Distance,
           data = mydataC)

# Refresh the standardized residuals from the refitted model. The StdResid
# column carried over from mydata was computed from the fit on the full sample,
# so plotting it against the new fitted values would mix two different models.
# NOTE: later steps that read mydataC$StdResid (the residual histogram and the
# Shapiro-Wilk test) now see the refreshed values; any previously recorded
# output for them should be regenerated.
mydataC$StdResid <- round(rstandard(fit2), 3)

# Standardize the fitted values (scale() centers and divides by the SD).
mydataC$StdFitted <- scale(fit2$fitted.values)
library(car)

# Residuals-vs-fitted plot for a visual check of homoskedasticity.
scatterplot(y = mydataC$StdResid, x = mydataC$StdFitted,
            ylab = "Standardized Residuals",
            xlab = "Standardized Fitted Values",
            boxplots = FALSE,
            regLine = FALSE,
            smooth = FALSE)
library(olsrr)
##
## 다음의 패키지를 부착합니다: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
# Breusch-Pagan test for heteroskedasticity on the current fit2 model.
ols_test_breusch_pagan(model = fit2)
##
## Breusch Pagan Test for Heteroskedasticity
## -----------------------------------------
## Ho: the variance is constant
## Ha: the variance is not constant
##
## Data
## ---------------------------------
## Response : Price
## Variables: fitted values of Price
##
## Test Summary
## ----------------------------
## DF = 1
## Chi2 = 1.738591
## Prob > Chi2 = 0.1873174
# Normality check of the standardized residuals stored in mydataC:
# a histogram followed by the Shapiro-Wilk test.
hist(
  mydataC[["StdResid"]],
  main = "Histogram of standardized residuals",
  xlab = "Standardized Residuals",
  ylab = "Frequency"
)
shapiro.test(x = mydataC$StdResid)
##
## Shapiro-Wilk normality test
##
## data: mydataC$StdResid
## W = 0.93418, p-value = 0.0004761
# Fit Price ~ Age + Distance on the trimmed sample and report the estimates.
fit2 <- lm(formula = Price ~ Age + Distance, data = mydataC)
summary(fit2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = mydataC)
##
## Residuals:
## Min 1Q Median 3Q Max
## -411.50 -203.69 -45.24 191.11 492.56
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2502.467 75.024 33.356 < 2e-16 ***
## Age -8.674 3.221 -2.693 0.00869 **
## Distance -24.063 2.692 -8.939 1.57e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 256.8 on 77 degrees of freedom
## Multiple R-squared: 0.5361, Adjusted R-squared: 0.524
## F-statistic: 44.49 on 2 and 77 DF, p-value: 1.437e-13
# Extend the model with the parking and balcony factors, then compare it with
# the nested fit2 model via an F-test before reporting the full estimates.
fit3 <- lm(
  formula = Price ~ Age + Distance + ParkingF + BalconyF,
  data = mydataC
)
anova(fit2, fit3)
## Analysis of Variance Table
##
## Model 1: Price ~ Age + Distance
## Model 2: Price ~ Age + Distance + ParkingF + BalconyF
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 77 5077362
## 2 75 4791128 2 286234 2.2403 0.1135
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + ParkingF + BalconyF, data = mydataC)
##
## Residuals:
## Min 1Q Median 3Q Max
## -390.93 -198.19 -53.64 186.73 518.34
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2393.316 93.930 25.480 < 2e-16 ***
## Age -7.970 3.191 -2.498 0.0147 *
## Distance -21.961 2.830 -7.762 3.39e-11 ***
## ParkingFYes 128.700 60.801 2.117 0.0376 *
## BalconyFYes 6.032 57.307 0.105 0.9165
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 252.7 on 75 degrees of freedom
## Multiple R-squared: 0.5623, Adjusted R-squared: 0.5389
## F-statistic: 24.08 on 4 and 75 DF, p-value: 7.764e-13
Parking: Given the values of the other
explanatory variables, the group of apartments with a parking space has
an average price per \(m^2\) that is
128.7 EUR higher than the group of apartments without a parking
space (\(p = 0.038\)).
Balcony: We did not find a statistically significant difference in the
average price per \(m^2\) between two otherwise equal
apartments where one has a balcony and the other
does not. Given the values of the other explanatory variables, the
group of apartments with a balcony has an average price per \(m^2\) that is 6.03 EUR higher than
the group of apartments without a balcony, but this difference is not statistically
significant (\(p = 0.917\)).
# Store the fit3 predictions and residuals next to the data and preview them
# together with the ID and observed Price.
mydataC[["FittedValues"]] <- fitted(fit3)
mydataC[["Residuals"]] <- residuals(fit3)
head(
  mydataC[, names(mydataC) %in% c("ID", "Price", "FittedValues", "Residuals")]
)
## ID Price FittedValues Residuals
## 1 1 1640 1728.641 -88.64095
## 2 2 2800 2356.597 443.40256
## 3 3 1660 1722.609 -62.60903
## 4 4 1850 1539.312 310.68782
## 5 5 1640 1989.286 -349.28625
## 6 6 1770 1912.655 -142.65528