#Research question: How does area and nearness to mainroad affect apartmant price?

# Read the CSV file
mydata <- read.table("~/NADJA ROGANOVIC/housing.csv", header=TRUE, sep=",", dec=".")

#Excluding unimportant variables
mydata<- mydata[, -c(3,4,5,7,8,9,10,11,12,13)]

head(mydata)
##      price area mainroad
## 1 13300000 7420      yes
## 2 12250000 8960      yes
## 3 12250000 9960      yes
## 4 12215000 7500      yes
## 5 11410000 7420      yes
## 6 10850000 7500      yes

Explanation of a dataset:

Explanation of the variables in the data set:

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
mydata <- mydata %>%
  mutate(ID = row_number())

mydata <- mydata %>%
  select(ID, everything())

head(mydata,6) #Creating a new variable ID
##   ID    price area mainroad
## 1  1 13300000 7420      yes
## 2  2 12250000 8960      yes
## 3  3 12250000 9960      yes
## 4  4 12215000 7500      yes
## 5  5 11410000 7420      yes
## 6  6 10850000 7500      yes
library(tidyr)
mydata <- drop_na(mydata) #Removing units with missing values

*There is no missing data

*Data manipulation

mydata$mainroadF <- factor(mydata$mainroad,
                        levels = c("yes", "no"),
                        labels = c("yes","no"))

head(mydata)
##   ID    price area mainroad mainroadF
## 1  1 13300000 7420      yes       yes
## 2  2 12250000 8960      yes       yes
## 3  3 12250000 9960      yes       yes
## 4  4 12215000 7500      yes       yes
## 5  5 11410000 7420      yes       yes
## 6  6 10850000 7500      yes       yes
library(pastecs)
## 
## Attaching package: 'pastecs'
## The following object is masked from 'package:tidyr':
## 
##     extract
## The following objects are masked from 'package:dplyr':
## 
##     first, last
summary(mydata[,c(-1,-4)]) #Descriptive statistics (excluding categorical data)
##      price               area       mainroadF
##  Min.   : 1750000   Min.   : 1650   yes:468  
##  1st Qu.: 3430000   1st Qu.: 3600   no : 77  
##  Median : 4340000   Median : 4600            
##  Mean   : 4766729   Mean   : 5151            
##  3rd Qu.: 5740000   3rd Qu.: 6360            
##  Max.   :13300000   Max.   :16200

##Regession

###Assumptions:

####Additional requirements * The dependent variable is numerical, the explanatory variables can be numerical or dichotomous (i.e., Dummy variable)

###Defining Regression model

Estimated linear regression model: price = b0 + b1* area + b2* mainroadF

library(car)
## Warning: package 'car' was built under R version 4.3.2
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
scatterplotMatrix(mydata[ , c(-1, -4,-5)],  #Excluding ID and categorical variables
                  smooth = FALSE) 

*From this scatter plot we can conclude that there is linearity between price and area.

cor(mydata$price, mydata$area) #Correlation matrix
## [1] 0.5359973
fit_yes <- lm(price ~ area, 
                 data = mydata[mydata$mainroadF == "yes", ])

summary(fit_yes)
## 
## Call:
## lm(formula = price ~ area, data = mydata[mydata$mainroadF == 
##     "yes", ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -4767310 -1004121  -208106   734596  7432619 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2.644e+06  2.018e+05   13.10   <2e-16 ***
## area        4.344e+02  3.461e+01   12.55   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1639000 on 466 degrees of freedom
## Multiple R-squared:  0.2527, Adjusted R-squared:  0.251 
## F-statistic: 157.5 on 1 and 466 DF,  p-value: < 2.2e-16
fit_no <- lm(price ~ area, 
                 data = mydata[mydata$mainroadF == "no", ])

summary(fit_no)
## 
## Call:
## lm(formula = price ~ area, data = mydata[mydata$mainroadF == 
##     "no", ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1516097  -566079  -152579   438198  2246638 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2.711e+06  3.201e+05   8.469 1.53e-12 ***
## area        1.907e+02  8.439e+01   2.260   0.0267 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 871500 on 75 degrees of freedom
## Multiple R-squared:  0.06374,    Adjusted R-squared:  0.05126 
## F-statistic: 5.106 on 1 and 75 DF,  p-value: 0.02674
fit1 <- lm(price ~ area + mainroadF, 
                 data = mydata)

summary(fit1)
## 
## Call:
## lm(formula = price ~ area + mainroadF, data = mydata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -4683840  -987688  -204714   701328  7454932 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2703560.89  188019.74  14.379  < 2e-16 ***
## area            423.38      32.14  13.174  < 2e-16 ***
## mainroadFno -831565.14  200048.72  -4.157 3.75e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1557000 on 542 degrees of freedom
## Multiple R-squared:  0.3093, Adjusted R-squared:  0.3068 
## F-statistic: 121.4 on 2 and 542 DF,  p-value: < 2.2e-16
vif(fit1) #Multicolinearity
##      area mainroadF 
##  1.091046  1.091046
mean(vif(fit1))
## [1] 1.091046
mydata$stdResid <- round(rstandard(fit1), 3) #standardized residuals 
mydata$CooksD <- round(cooks.distance(fit1), 3) #Cooks distances

hist (mydata$stdResid,
      xlab = "Standardized residuals",
      ylab = "Frequency",
      main = "Histogram of standardized residuals")

shapiro.test(mydata$stdResid) #Checking normality 
## 
##  Shapiro-Wilk normality test
## 
## data:  mydata$stdResid
## W = 0.95166, p-value = 2.305e-12

We reject the null hypothesis (p<0.001) and conclude that errors are not normally distributed.(However, this doesn’t matter since the sample size is sufficiently big)

hist(mydata$CooksD,
     xlab = "CooksD",
     ylab = "Frequency",
     main = "Histogram of Cooks distances",
     col = "lightblue")

* Based on histogram, we can conclude that there are some gaps. This will be solved by removing some units of high impact

head(mydata[order(mydata$stdResid), c("ID","stdResid")],10) #Highest values of standardized residuals
##      ID stdResid
## 404 404   -3.048
## 521 521   -2.261
## 126 126   -2.213
## 453 453   -2.168
## 212 212   -2.125
## 402 402   -2.081
## 474 474   -2.001
## 461 461   -1.961
## 495 495   -1.834
## 278 278   -1.800
head(mydata[order(-mydata$stdResid), c("ID", "stdResid")], 10)  #Lowest values of standardized residuals
##    ID stdResid
## 1   1    4.796
## 4   4    4.077
## 2   2    3.708
## 5   5    3.580
## 3   3    3.441
## 14 14    3.252
## 6   6    3.198
## 10 10    2.997
## 12 12    2.852
## 19 19    2.725
mydata <- mydata %>%
  filter(!ID %in% c(404,1,4,2,5,3,14,6)) #Removing outliers
head(mydata[order(-mydata$CooksD), c("ID", "CooksD")], 10) #Six units with highest value of Cooks distance
##      ID CooksD
## 119 126  0.079
## 205 212  0.040
## 180 187  0.015
## 271 278  0.014
## 395 402  0.014
## 1     7  0.013
## 445 453  0.012
## 151 158  0.011
## 3     9  0.010
## 185 192  0.010
library(dplyr)
mydata <- mydata %>%
  filter(!ID %in% c(126,212)) #Removing units of high impact

*Now we will check if we removed all outliers and units of high impact

fit1 <- lm(price ~ area + mainroadF,  
           data = mydata)

mydata$stdResid <- round(rstandard(fit1), 3)
mydata$CooksD <- round(cooks.distance(fit1),3)
hist(mydata$stdResid,
     xlab= "Standard residuals",
     ylab = "Frequency",
     main= "Histogram of standardized residuals")

head(mydata[order(mydata$stdResid), c("ID","stdResid")],10) #Highest values of standardized residuals
##      ID stdResid
## 511 521   -2.491
## 443 453   -2.390
## 393 402   -2.294
## 464 474   -2.201
## 451 461   -2.156
## 485 495   -2.011
## 269 278   -1.981
## 535 545   -1.810
## 474 484   -1.803
## 346 355   -1.749
head(mydata[order(-mydata$stdResid), c("ID", "stdResid")], 10)  #Lowest values of standardized residuals
##    ID stdResid
## 4  10    3.399
## 6  12    3.237
## 12 19    3.097
## 14 21    3.083
## 17 24    2.933
## 9  16    2.819
## 7  13    2.802
## 1   7    2.789
## 3   9    2.733
## 10 17    2.635
mydata <- mydata %>%
  filter(!ID %in% c(10,12,19,21)) #Removing outliers

*Checking again

hist(mydata$stdResid,
     xlab= "Standard residuals",
     ylab = "Frequency",
     main= "Histogram of standardized residuals") #Testing once again

* Outliers removed

head(mydata[order(-mydata$CooksD), c("ID", "CooksD")], 6) #Six units with highest value of Cooks distance
##      ID CooksD
## 175 187  0.020
## 1     7  0.019
## 265 278  0.019
## 389 402  0.019
## 439 453  0.017
## 3     9  0.015
library(dplyr)
mydata <- mydata %>%
  filter(!ID %in% c(7,278,402,187,9,158)) #Removing units of high impact
hist(mydata$CooksD,
     xlab = "CooksD",
     ylab = "Frequency",
     main = "Histogram of Cooks distances",
     col = "lightblue")

fit <- lm (price ~ area + mainroadF, 
           data = mydata)

mydata$stdFitted <- scale(fit$fitted.values)

library(car)
scatterplot(stdResid ~ stdFitted,
            ylab = "Standardized residuals",
            xlab = "Standardized fitted values",
            smooth = FALSE,
            regLine = FALSE,
            boxplots = FALSE,
            data=mydata)

library(olsrr)
## Warning: package 'olsrr' was built under R version 4.3.2
## 
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
## 
##     rivers
ols_test_breusch_pagan(fit1)
## 
##  Breusch Pagan Test for Heteroskedasticity
##  -----------------------------------------
##  Ho: the variance is constant            
##  Ha: the variance is not constant        
## 
##               Data                
##  ---------------------------------
##  Response : price 
##  Variables: fitted values of price 
## 
##          Test Summary           
##  -------------------------------
##  DF            =    1 
##  Chi2          =    50.55006 
##  Prob > Chi2   =    1.161634e-12

*We reject null hypothesis at p<0.001

library(estimatr)
## Warning: package 'estimatr' was built under R version 4.3.2
fit1 <- lm_robust(price ~ area + mainroadF,  
                  se_type = "HC1",
                  data = mydata)


summary(fit1)
## 
## Call:
## lm_robust(formula = price ~ area + mainroadF, data = mydata, 
##     se_type = "HC1")
## 
## Standard error type:  HC1 
## 
## Coefficients:
##              Estimate Std. Error t value  Pr(>|t|) CI Lower  CI Upper  DF
## (Intercept) 2526546.6  150325.69  16.807 5.510e-51  2231229 2821864.3 522
## area            437.6      30.36  14.415 6.805e-40      378     497.2 522
## mainroadFno -737959.2  120592.24  -6.119 1.845e-09  -974865 -501053.4 522
## 
## Multiple R-squared:  0.3669 ,    Adjusted R-squared:  0.3645 
## F-statistic: 152.6 on 2 and 522 DF,  p-value: < 2.2e-16

####Area

####MainroadFno

####Multiple R-squared *33.72% of variability of price is explained by variability of area and whether apartment is located on main road

####F-statistic

####Example of calculating price price = 2526546.6 + 437.6 * area - 762525.5 * mainroadF

price = 2526546.6 + 437.6 * 4600 - 762525.5 * 0 price = 45395066

sqrt(summary(fit1)$r.squared) #Coefficient of multiple correlation
## [1] 0.6057096