1. Descriptive Analysis on Variables

# Load the housing data set from the local Downloads folder.
Housing.5 <- read.csv(file = "~/Downloads/Housing 5.csv")

# Organizing Data
# Numeric columns are copied out as stand-alone vectors.
Price <- Housing.5[["price"]]
Area <- Housing.5[["area"]]
Bedrooms <- Housing.5[["bedrooms"]]
Bathrooms <- Housing.5[["bathrooms"]]
Stories <- Housing.5[["stories"]]
Parking <- Housing.5[["parking"]]

# "yes"/other columns are recoded as 1/0 indicator vectors
# (1 when the value is exactly "yes", 0 otherwise, NA stays NA).
Mainroad <- as.numeric(Housing.5[["mainroad"]] == "yes")
Guestroom <- as.numeric(Housing.5[["guestroom"]] == "yes")
Basement <- as.numeric(Housing.5[["basement"]] == "yes")
Hot_Water_Heating <- as.numeric(Housing.5[["hotwaterheating"]] == "yes")
Air_Conditioning <- as.numeric(Housing.5[["airconditioning"]] == "yes")
Preferred_Area <- as.numeric(Housing.5[["prefarea"]] == "yes")

Quantitative Variables:

Price

Histogram

# Histogram of price, rescaled to thousands so the axis labels stay readable.
hist(
  x = Price / 1000,
  xlab = "Price in Thousands $"
)

COMMENT:

Box Plot

# Box plot of price (in thousands); points beyond the whiskers are drawn
# individually.
boxplot(
  x = Price / 1000,
  xlab = "Price in Thousands $"
)

COMMENT:

Area

Scatter-Plot

# Scatter plot of price (in thousands) against area.
plot(
  x = Area,
  y = Price / 1000,
  xlab = "Area",
  ylab = "Price in Thousands $"
)

COMMENT:

Box-Plot

# Box plot of area.
boxplot(
  x = Area,
  xlab = "Area"
)

COMMENT:

Bedrooms

Histogram

# Histogram of the bedroom counts.
hist(
  x = Bedrooms,
  xlab = "Number of Bedrooms"
)

COMMENT:

Bathrooms

# Histogram of the bathroom counts.
hist(
  x = Bathrooms,
  xlab = "Number of Bathrooms"
)

COMMENT:

Stories

# Histogram of the story counts.
hist(
  x = Stories,
  xlab = "Number of Stories"
)

COMMENT:

Parking

# Histogram of the parking counts with one bar per integer value.
# A single number passed to `breaks` is only a suggestion, and bins with
# integer edges are right-closed, so neighbouring counts (e.g. 0 and 1) would
# share a bar. Break points at the half-integers keep each count separate.
hist(
  Parking,
  breaks = seq(
    floor(min(Parking, na.rm = TRUE)) - 0.5,
    ceiling(max(Parking, na.rm = TRUE)) + 0.5,
    by = 1
  )
)

COMMENT:

Overall

# Side-by-side box plots of the four count-type columns of Housing.5.
analysis_1 <- Housing.5[
  c("bedrooms", "bathrooms", "stories", "parking")
]
boxplot(x = analysis_1)

Categorical Variables

Mainroad

# Mainroad is a 0/1 indicator (1 = "yes"), i.e. categorical, so show the
# category counts with a bar chart instead of a histogram on a continuous axis.
# Fixing the factor levels keeps both bars even if one category is empty.
barplot(
  table(factor(Mainroad, levels = c(0, 1), labels = c("No", "Yes"))),
  xlab = "Mainroad",
  ylab = "Frequency"
)

COMMENT:

Guestroom

# Guestroom is a 0/1 indicator (1 = "yes"), i.e. categorical, so show the
# category counts with a bar chart instead of a histogram on a continuous axis.
# Fixing the factor levels keeps both bars even if one category is empty.
barplot(
  table(factor(Guestroom, levels = c(0, 1), labels = c("No", "Yes"))),
  xlab = "Guestroom",
  ylab = "Frequency"
)

COMMENT:

Basement

# Basement is a 0/1 indicator (1 = "yes"), i.e. categorical, so show the
# category counts with a bar chart instead of a histogram on a continuous axis.
# Fixing the factor levels keeps both bars even if one category is empty.
barplot(
  table(factor(Basement, levels = c(0, 1), labels = c("No", "Yes"))),
  xlab = "Basement",
  ylab = "Frequency"
)

COMMENT:

Hot Water Heating

# Hot_Water_Heating is a 0/1 indicator (1 = "yes"), i.e. categorical, so show
# the category counts with a bar chart instead of a histogram on a continuous
# axis. Fixing the factor levels keeps both bars even if one category is empty.
barplot(
  table(factor(Hot_Water_Heating, levels = c(0, 1), labels = c("No", "Yes"))),
  xlab = "Hot Water Heating",
  ylab = "Frequency"
)

COMMENT:

Air Conditioning

# Air_Conditioning is a 0/1 indicator (1 = "yes"), i.e. categorical, so show
# the category counts with a bar chart instead of a histogram on a continuous
# axis. Fixing the factor levels keeps both bars even if one category is empty.
barplot(
  table(factor(Air_Conditioning, levels = c(0, 1), labels = c("No", "Yes"))),
  xlab = "Air Conditioning",
  ylab = "Frequency"
)

COMMENT:

Preferred Area

# Preferred_Area is a 0/1 indicator (1 = "yes"), i.e. categorical, so show the
# category counts with a bar chart instead of a histogram on a continuous axis.
# Fixing the factor levels keeps both bars even if one category is empty.
barplot(
  table(factor(Preferred_Area, levels = c(0, 1), labels = c("No", "Yes"))),
  xlab = "Preferred Area",
  ylab = "Frequency"
)

COMMENT:

2. Basic Linear Regression Model

# Ordinary least squares fit of price on every other column of Housing.5
# (character columns are expanded into dummy variables by lm()).
reg.mod <- lm(
  formula = price ~ .,
  data = Housing.5
)
summary(object = reg.mod)
## 
## Call:
## lm(formula = price ~ ., data = Housing.5)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2619718  -657322   -68409   507176  5166695 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      42771.69  264313.31   0.162 0.871508    
## area                               244.14      24.29  10.052  < 2e-16 ***
## bedrooms                        114787.56   72598.66   1.581 0.114445    
## bathrooms                       987668.11  103361.98   9.555  < 2e-16 ***
## stories                         450848.00   64168.93   7.026 6.55e-12 ***
## mainroadyes                     421272.59  142224.13   2.962 0.003193 ** 
## guestroomyes                    300525.86  131710.22   2.282 0.022901 *  
## basementyes                     350106.90  110284.06   3.175 0.001587 ** 
## hotwaterheatingyes              855447.15  223152.69   3.833 0.000141 ***
## airconditioningyes              864958.31  108354.51   7.983 8.91e-15 ***
## parking                         277107.10   58525.89   4.735 2.82e-06 ***
## prefareayes                     651543.80  115682.34   5.632 2.89e-08 ***
## furnishingstatussemi-furnished  -46344.62  116574.09  -0.398 0.691118    
## furnishingstatusunfurnished    -411234.39  126210.56  -3.258 0.001192 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1068000 on 531 degrees of freedom
## Multiple R-squared:  0.6818, Adjusted R-squared:  0.674 
## F-statistic: 87.52 on 13 and 531 DF,  p-value: < 2.2e-16

COMMENT:

3. Identifying Outliers

As shown by the box plots in Section 1, there are many outliers in the price and area variables, and a few in the parking and bathrooms variables.

# Remove outliers with the 1.5 * IQR rule (strict inequalities on both fences).
#
# Every quartile and every row mask is computed from the columns of Housing.5
# itself, before any row is dropped, and the four masks are then applied in a
# single step. A mask must have exactly nrow(Housing.5) elements; building it
# from a stand-alone vector that was extracted earlier (and is therefore longer
# than an already-filtered data frame) would select the wrong rows.
#
# NOTE: the stand-alone vectors Price, Area, Parking, Bathrooms, ... created
# when the data were loaded are not filtered here and keep their full length.

# Fences for Price
Price_Q1 <- quantile(Housing.5$price, 0.25)
Price_Q3 <- quantile(Housing.5$price, 0.75)
Price_IQR <- IQR(Housing.5$price)
keep_price <- Housing.5$price > (Price_Q1 - 1.5 * Price_IQR) &
  Housing.5$price < (Price_Q3 + 1.5 * Price_IQR)

# Fences for Area
Area_Q1 <- quantile(Housing.5$area, 0.25)
Area_Q3 <- quantile(Housing.5$area, 0.75)
Area_IQR <- IQR(Housing.5$area)
keep_area <- Housing.5$area > (Area_Q1 - 1.5 * Area_IQR) &
  Housing.5$area < (Area_Q3 + 1.5 * Area_IQR)

# Fences for Parking
Parking_Q1 <- quantile(Housing.5$parking, 0.25)
Parking_Q3 <- quantile(Housing.5$parking, 0.75)
Parking_IQR <- IQR(Housing.5$parking)
keep_parking <- Housing.5$parking > (Parking_Q1 - 1.5 * Parking_IQR) &
  Housing.5$parking < (Parking_Q3 + 1.5 * Parking_IQR)

# Fences for Bathrooms
Bathroom_Q1 <- quantile(Housing.5$bathrooms, 0.25)
Bathroom_Q3 <- quantile(Housing.5$bathrooms, 0.75)
Bathroom_IQR <- IQR(Housing.5$bathrooms)
keep_bathrooms <- Housing.5$bathrooms > (Bathroom_Q1 - 1.5 * Bathroom_IQR) &
  Housing.5$bathrooms < (Bathroom_Q3 + 1.5 * Bathroom_IQR)

# Keep only rows that lie inside all four pairs of fences. which() turns the
# logical mask into row positions so an NA in the mask cannot create NA rows.
Housing.5 <- Housing.5[
  which(keep_price & keep_area & keep_parking & keep_bathrooms),
]

4. Predictor Selection (Mallows' Cp and Boruta Algorithm)

Mallows' Cp