Part I

Part II

# Load the housing training set from its CSV file
# (the path is specific to the author's machine)
house <- read.csv(
  file = "/Users/pin.lyu/Desktop/BC_Class_Folder/Econometrics/DIS_&_ASSIGNMENT/DIS_4/Housing/train.csv"
)
# Count the missing values in every column
vapply(house, function(column) sum(is.na(column)), numeric(1))
##           beds          baths           size     size_units       lot_size 
##              0              0              0              0            347 
## lot_size_units       zip_code          price 
##              0              0              0
# Drop every row that contains at least one missing value
house2 <- na.omit(object = house)

# Re-count the missing values per column to confirm none remain
vapply(house2, function(column) sum(is.na(column)), numeric(1))
##           beds          baths           size     size_units       lot_size 
##              0              0              0              0              0 
## lot_size_units       zip_code          price 
##              0              0              0
# Inspect price for outliers: boxplot first, then the numeric summary
boxplot(house2[["price"]], ylab = "price")

summary(house2[["price"]])
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##   159000   680000   865000  1038475  1175000 25000000

It looks like we have outliers that are affecting our data (the maximum price of 25,000,000 is far above the third quartile of 1,175,000), so next we are going to eliminate the outliers in price.

# Remove price outliers using the 1.5 * IQR rule
# First and third quartiles of price
quartiles <- quantile(house2[["price"]], probs = c(0.25, 0.75), na.rm = FALSE)
# Interquartile range of price (the variable shares its name with stats::IQR();
# function calls such as IQR(x) still resolve to the function)
IQR <- stats::IQR(house2[["price"]])

# Fences: 1.5 interquartile ranges below Q1 and above Q3
Lower <- quartiles["25%"] - 1.5 * IQR
Upper <- quartiles["75%"] + 1.5 * IQR

# Keep only the rows whose price lies strictly between the two fences
data_no_outlier <- subset(house2, price > Lower & price < Upper)

# Rows and columns remaining after the filter
dim(data_no_outlier)
## [1] 1565    8

The outliers are now excluded; let’s check whether it worked.

# Redraw the price boxplot using the outlier-filtered data
boxplot(data_no_outlier[["price"]], ylab = "price")

# Fit a linear regression of price on beds, baths, size and lot_size.
# NOTE(review): the model is fitted on `house2` (NA rows dropped, price
# outliers still included), not on `data_no_outlier` -- confirm this is
# intended. The coefficient table shown below was produced from `house2`.
my_reg <- lm(
  price ~ beds + baths + size + lot_size,
  data = house2
)

# Print the coefficient table (estimate, std. error, t value, p-value).
# The component is named `coefficients`; `$coefficient` only worked through
# `$` partial matching on the summary list, so the name is spelled out in full.
summary(my_reg)$coefficients
##                  Estimate   Std. Error    t value     Pr(>|t|)
## (Intercept)  292277.95998 69475.131333  4.2069436 2.726411e-05
## beds        -125889.21034 29279.047806 -4.2996347 1.810042e-05
## baths         71176.85676 30474.626101  2.3356105 1.962983e-02
## size            510.01587    38.657235 13.1932837 7.119295e-38
## lot_size          2.69381     8.966492  0.3004308 7.638861e-01

Part III

# Draw the default plots for the fitted linear model `my_reg`
plot(my_reg)