Final Stat Project

# Load libraries
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
library(scales)
library(caret)
## Loading required package: lattice
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.2.1
## ✔ purrr     1.0.4     ✔ tidyr     1.3.1
## ✔ readr     2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor()     masks scales::col_factor()
## ✖ randomForest::combine() masks dplyr::combine()
## ✖ purrr::discard()        masks scales::discard()
## ✖ dplyr::filter()         masks stats::filter()
## ✖ dplyr::lag()            masks stats::lag()
## ✖ purrr::lift()           masks caret::lift()
## ✖ ggplot2::margin()       masks randomForest::margin()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:purrr':
## 
##     some
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
library(boot)
## 
## Attaching package: 'boot'
## 
## The following object is masked from 'package:car':
## 
##     logit
## 
## The following object is masked from 'package:lattice':
## 
##     melanoma
library(Metrics)  # for rmse
## 
## Attaching package: 'Metrics'
## 
## The following objects are masked from 'package:caret':
## 
##     precision, recall
library(caret)    # for R2

Introduction

We start by loading the cleaned apartment rental dataset and previewing the first few rows.

df <- read.csv('cleaned_apartments.csv')

head(df)

Cleaning data

Checking out all columns

names(df)
##  [1] "id"            "category"      "title"         "body"         
##  [5] "amenities"     "bathrooms"     "bedrooms"      "currency"     
##  [9] "fee"           "has_photo"     "pets_allowed"  "price"        
## [13] "price_display" "price_type"    "square_feet"   "address"      
## [17] "cityname"      "state"         "latitude"      "longitude"    
## [21] "source"        "time"

Out of the 22 columns, I’m only going to use the following 10 (a quick subsetting sketch follows the list):

  • amenities

  • bathrooms

  • bedrooms

  • fee

  • has_photo

  • pets_allowed

  • price

  • square_feet

  • cityname

  • state
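A minimal sketch of that subsetting, assuming the column names above (the report itself keeps df intact and drops unused columns later via columns_to_ignore; df_10 is illustrative):

# Hypothetical subset to the ten columns of interest
df_10 <- df |>
  select(amenities, bathrooms, bedrooms, fee, has_photo,
         pets_allowed, price, square_feet, cityname, state)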

A value_counts() helper function

value_counts <- function(df, col_name) {
  # Count rows per level of col_name, most frequent first
  result <- df |>
    group_by({{ col_name }}) |>
    summarise(n = n(), .groups = 'drop') |>
    arrange(desc(n))
  
  return(as.data.frame(result))  # Force data.frame output
}

Column amenities

value_counts(df, amenities)

Each row stores its amenities as a single comma-separated string. Let’s see which distinct amenities appear across the entire dataset.

split_values <- strsplit(df$amenities, split = ",")

all_values <- unlist(split_values)
all_values <- trimws(all_values)

value_counts_temp <- as.data.frame(table(all_values))
value_counts_temp <- value_counts_temp[order(-value_counts_temp$Freq), ]  # Sort descending

print(value_counts_temp)
##            all_values Freq
## 17            Parking 3727
## 6          Dishwasher 3266
## 20               Pool 3238
## 21       Refrigerator 3133
## 18         Patio/Deck 2472
## 4  Cable or Satellite 1678
## 22            Storage 1531
## 13                Gym 1469
## 15    Internet Access 1441
## 5           Clubhouse 1317
## 10   Garbage Disposal 1210
## 26       Washer Dryer 1077
## 9           Fireplace 1065
## 19         Playground  782
## 1                  AC  662
## 8            Elevator  642
## 23             Tennis  482
## 11              Gated  473
## 27        Wood Floors  357
## 14            Hot Tub  346
## 3          Basketball  318
## 24                 TV  207
## 25               View  149
## 7             Doorman   29
## 2               Alarm   23
## 12               Golf   23
## 16             Luxury   11

There are 27 distinct amenities in the dataset, so we can explode this column into 27 new binary (Yes/No) indicator columns.

unique_values <- unique(all_values)

for (val in unique_values) {
  # fixed = TRUE matches the amenity name as a literal string, not a regex
  df[[val]] <- ifelse(grepl(val, df$amenities, fixed = TRUE), "Yes", "No")
}

Column bathrooms

value_counts(df, bathrooms)

There are some missing values in this column, so I’m replacing them with 1, since an apartment with zero bathrooms doesn’t make sense.

df$bathrooms[is.na(df$bathrooms)] <- 1.0

Checking whether values are replaced.

value_counts(df, bathrooms)
ggplot(df, aes(x = as.factor(bathrooms), y = price)) +
  geom_boxplot() +
  labs(title = "Price vs No. of Bathrooms", x = "No. of bathrooms", y = "Price") +
  theme_minimal()

Column bedrooms

value_counts(df, bedrooms)

Replacing missing values with 1.

df$bedrooms[is.na(df$bedrooms)] <- 1.0
ggplot(df, aes(x = as.factor(bedrooms), y = price)) +
  geom_boxplot() +
  labs(title = "Price vs No.of Bedrooms", x = "No.of Bedrooms", y = "Price") +
  theme_minimal()

Column fee

value_counts(df, fee)
ggplot(df, aes(x = fee, y = price)) +
  geom_boxplot() +
  labs(title = "Price vs Fee", x = "Fee", y = "Price") +
  theme_minimal()

Since this column contains only the value ‘No’, it carries no information, so we can drop it.
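A quick programmatic check of that claim (a sketch, not part of the original analysis):

# A constant column has exactly one distinct non-missing value
length(unique(na.omit(df$fee))) == 1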

Column has_photo

value_counts(df, has_photo)
ggplot(df, aes(x = has_photo, y = price)) +
  geom_boxplot() +
  labs(title = "Price vs Has Photo", x = "Has photo", y = "Price") +
  theme_minimal()

Column pets_allowed

I’m going to split the pets_allowed values (e.g. “Cats,Dogs”) into separate binary Cats and Dogs columns.

value_counts(df, pets_allowed)
ggplot(df, aes(x = pets_allowed, y = price)) +
  geom_boxplot() +
  labs(title = "Price vs Pets Allowed", x = "Pets allowed", y = "Price") +
  theme_minimal()

unique_values <- c('Cats', 'Dogs')

for (val in unique_values) {
  df[[val]] <- ifelse(grepl(val, df$pets_allowed), "Yes", "No")
}

Column square_feet

df |>
  #filter(df$square_feet < 3000) |>
  ggplot(aes(x = square_feet, y = price)) +   
  geom_point(color = 'steelblue') +
  geom_smooth(method = "lm", se = TRUE, color = "black") +  # Regression line
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  labs(
    x = "Square Feet",
    y = "Price",
    title = "Square Feet Vs Price"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

It looks like there are a few extreme listings that might unduly influence our model.
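One way to handle them (a sketch echoing the commented-out filter above; both cutoffs are illustrative, not tuned):

# Illustrative trim: drop very large or very expensive listings
df_trimmed <- df |>
  filter(square_feet < 3000,
         price < quantile(price, 0.99, na.rm = TRUE))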

Column cityname

First, let’s fill the missing/empty rows with ‘Others’ and then inspect.

df$cityname[is.na(df$cityname) | df$cityname == ""] <- 'Others'
value_counts(df, cityname)

Since there are more than 1,500 cities, I am going to bucket them into 3 tiers by listing count, which should make the feature easier for the model to use:

  • Tier 1 (count > 50)

  • Tier 2 (count >= 10 and < 50)

  • Tier 3 (count < 10)

city_counts <- df |>
  count(cityname)

city_counts <- city_counts |>
  mutate(
    city_tier = case_when(
      n > 50 ~ "Tier 1",
      n >= 10 & n < 50 ~ "Tier 2",
      n < 10 ~ "Tier 3"
    )
  )

df <- df |>
  left_join(city_counts |> select(cityname, city_tier), by = "cityname")
ggplot(df, aes(x = city_tier, y = price)) +
  geom_boxplot() +
  labs(title = "Price vs City Tier", x = "City tier", y = "Price") +
  theme_minimal()

Creating another set of tiers based on each city’s mean rent:

  • Tier 1 (avg_rent > $3500)

  • Tier 2 (avg_rent >= $1500 & avg_rent <= $3500)

  • Tier 3 (avg_rent < $1500)

# Step 1: Calculate average rent per city
city_avg_rent <- df |>
  group_by(cityname) |>
  summarise(avg_rent = mean(price, na.rm = TRUE))

# Step 2: Assign tiers based on average rent
city_avg_rent <- city_avg_rent |>
  mutate(
    rent_tier = case_when(
      avg_rent > 3500 ~ "Tier 1",
      avg_rent >= 1500 & avg_rent <= 3500 ~ "Tier 2",
      avg_rent < 1500 ~ "Tier 3"
    )
  )

# Step 3: Join the rent tier back to the original dataframe
df <- df |>
  left_join(city_avg_rent |> select(cityname, rent_tier), by = "cityname")
ggplot(df, aes(x = rent_tier, y = price)) +
  geom_boxplot() +
  labs(title = "Price vs Rent Tier", x = "Rent tier", y = "Price") +
  theme_minimal()

value_counts(df, rent_tier)
value_counts(df, city_tier)

We will also target encode cityname and state, replacing each category with the mean price observed for that category. This will be useful for the linear regression model.

target_encode <- function(df, cat_cols, target_col) {
  for (col in cat_cols) {
    # Mean of the target per category level
    means <- aggregate(df[[target_col]], by = list(df[[col]]), FUN = mean, na.rm = TRUE)
    names(means) <- c(col, paste0(col, "_te"))
    # Note: merge() re-sorts the rows by the key column
    df <- merge(df, means, by = col, all.x = TRUE)
    #df[[col]] <- NULL  # optionally drop original column
  }
  return(df)
}
df <- target_encode(df, "cityname", "price")
df <- target_encode(df, "state", "price")

We will also scale selected numeric columns with a min-max scaler, which maps each column linearly onto [0, 1].

min_max_scale <- function(column) {
  # (x - min) / (max - min), ignoring missing values
  scaled <- (column - min(column, na.rm = TRUE)) / 
            (max(column, na.rm = TRUE) - min(column, na.rm = TRUE))
  return(scaled)
}

Building Model

columns_to_ignore <- c(
  "id", "category", "title", "body", "fee", "currency", 
  "price_display", "price_type", "state", "address", 
  "latitude", "longitude", "source", "time", "amenities",
  "has_photo", "cityname", "pets_allowed", "city_tier", "cityname_te", "state_te"
)

df_new <- df[, !(names(df) %in% columns_to_ignore)]

Decision Tree

# Make sure 'price' is numeric
df_new$price <- as.numeric(df_new$price)

# Split into train and test sets
set.seed(1)
train_idx <- sample(seq_len(nrow(df_new)), size = 0.7 * nrow(df_new))
train_data <- df_new[train_idx, ]
test_data <- df_new[-train_idx, ]

# Fit a regression tree (since price is numeric)
tree_model <- rpart(price ~ ., data = train_data, method = "anova")

# Plot the tree
rpart.plot(tree_model)

# Predict on test data
pred <- predict(tree_model, newdata = test_data)

# Calculate RMSE
rmse_val <- rmse(test_data$price, pred)

# Calculate R²
r2_val <- R2(pred, test_data$price)

# Print results
cat("RMSE:", rmse_val, "\n")
## RMSE: 699.5394
cat("R_squared:", r2_val, "\n")
## R_squared: 0.4516386
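A possible refinement, not run in this report: rpart grows the tree under a complexity parameter (cp) and records a cross-validated error for each candidate cp, which suggests a pruning point. A sketch:

# Inspect rpart's built-in cross-validated error table
printcp(tree_model)

# Prune at the cp with the lowest cross-validated error (illustrative)
best_cp <- tree_model$cptable[which.min(tree_model$cptable[, "xerror"]), "CP"]
pruned_model <- prune(tree_model, cp = best_cp)
rpart.plot(pruned_model)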

Random Forest

names(df_new) <- make.names(names(df_new))
# Train-test split (70% train, 30% test)
set.seed(123)
train_idx <- sample(seq_len(nrow(df_new)), size = 0.7 * nrow(df_new))
train_data <- df_new[train_idx, ]
test_data <- df_new[-train_idx, ]

rf_model <- randomForest(
  price ~ ., 
  data = train_data, 
  ntree = 500,          # Number of trees
  mtry = floor(sqrt(ncol(train_data) - 1)),  # Number of variables tried at each split
  importance = TRUE
)

# View model summary
print(rf_model)
## 
## Call:
##  randomForest(formula = price ~ ., data = train_data, ntree = 500,      mtry = floor(sqrt(ncol(train_data) - 1)), importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 5
## 
##           Mean of squared residuals: 746867.8
##                     % Var explained: 39.88
# Variable Importance Plot
varImpPlot(rf_model)

# Predict on test data
predictions <- predict(rf_model, newdata = test_data)

# Calculate RMSE (Root Mean Squared Error)
# Note: this assignment masks Metrics::rmse() used in the decision tree section
rmse <- sqrt(mean((predictions - test_data$price)^2))
print(paste("Test RMSE:", round(rmse, 2)))
## [1] "Test RMSE: 623.42"
# Create tuning grid
tune_grid <- expand.grid(mtry = seq(2, sqrt(ncol(train_data)-1)*2, by = 1))  # trying different mtry values

# Cross-validation settings
train_control <- trainControl(
  method = "cv",        # Cross Validation
  number = 5,           # 5-fold CV
  verboseIter = TRUE    # Show progress
)

# Train Random Forest with tuning
set.seed(123)
rf_tuned <- train(
  price ~ ., 
  data = train_data,
  method = "rf",
  tuneGrid = tune_grid,
  trControl = train_control,
  ntree = 500,
  importance = TRUE
)
## (per-fold training progress omitted: folds 1-5, mtry 2-11)
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 7 on full training set
# Best tuning parameter
print(rf_tuned$bestTune)
##   mtry
## 6    7
# Full results
print(rf_tuned)
## Random Forest 
## 
## 7000 samples
##   33 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 5600, 5600, 5600, 5600, 5600 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared   MAE     
##    2    862.2460  0.4305304  398.4646
##    3    813.7618  0.4719169  368.1542
##    4    794.0612  0.4891121  356.4038
##    5    784.7608  0.4966200  350.7674
##    6    775.5746  0.5067759  347.2245
##    7    774.5047  0.5069861  346.9900
##    8    780.3102  0.4995471  348.5823
##    9    778.6647  0.5007279  348.4353
##   10    777.7674  0.5019659  348.9894
##   11    782.2437  0.4975384  350.9046
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 7.
# Plot performance vs. mtry
plot(rf_tuned)

actual <- test_data$price
predicted <- predict(rf_model, newdata = test_data)

mae <- mean(abs(actual - predicted))
mse <- mean((actual - predicted)^2)
rmse <- sqrt(mse)

rss <- sum((actual - predicted)^2)
tss <- sum((actual - mean(actual))^2)
r_squared <- 1 - rss/tss


cat("MAE:", mae, "\n")
## MAE: 347.5443
cat("MSE:", mse, "\n")
## MSE: 388648.7
cat("RMSE:", rmse, "\n")
## RMSE: 623.4169
cat("R-squared:", r_squared, "\n")
## R-squared: 0.596768
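Note that the test-set metrics above score the untuned rf_model; evaluating the tuned forest on the same held-out data would look like this (a sketch, not run here):

# Sketch: score the caret-tuned model on the same test set
tuned_pred <- predict(rf_tuned, newdata = test_data)
tuned_rmse <- sqrt(mean((test_data$price - tuned_pred)^2))
cat("Tuned test RMSE:", tuned_rmse, "\n")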

Linear Regression

Functions to plot VIF and coefficients

plot_vif <- function(model, threshold = 5) {
  require(car)
  require(ggplot2)
  
  vif_values <- vif(model)
  
  vif_df <- data.frame(
    Variable = names(vif_values),
    VIF = as.numeric(vif_values)
  )
  
  ggplot(vif_df) +
    geom_bar(mapping = aes(x = VIF, y = Variable), stat = "identity", fill = "steelblue") +
    geom_vline(xintercept = threshold, linetype = "dashed", color = "red", linewidth = 1) +
    labs(title = "VIF Values", x = "VIF", y = "Variable") +
    theme_minimal() +
    theme(
      axis.text.x = element_text(angle = 0),
      axis.text.y = element_text(angle = 0)
    )
}
plot_coefficients <- function(model) {
  require(ggplot2)
  
  coef_df <- data.frame(
    Predictor = names(coef(model))[-1],
    Coefficient = coef(model)[-1]
  )
  
  ggplot(coef_df, aes(x = reorder(Predictor, Coefficient), y = Coefficient)) +
    geom_bar(stat = "identity", fill = "steelblue") +
    coord_flip() +
    labs(
      title = "Linear Regression Coefficients",
      x = "Predictors",
      y = "Coefficient Value"
    ) +
    theme_minimal()
}

First, we will pull the required columns into a new dataframe used only for linear regression.

df_lr <- df |>
  select(
    bathrooms,
    bedrooms,
    square_feet,
    Cats,
    Dogs,
    Dishwasher,
    Elevator,
    `Patio/Deck`,
    Pool,
    Storage,
    Refrigerator,
    AC,
    Basketball,
    `Cable or Satellite`,
    Gym,
    `Internet Access`,
    Clubhouse,
    Parking,
    `Garbage Disposal`,
    Fireplace,
    `Washer Dryer`,
    Playground,
    Gated,
    TV,
    `Hot Tub`,
    Tennis, 
    `Wood Floors`,
    View, 
    Alarm,
    Doorman, 
    Luxury, 
    Golf,
    cityname_te,
    state_te,
    price
    
  )
df_lr 
# List of columns to exclude
exclude_cols <- c("bathrooms", "bedrooms", "square_feet", "cityname_te", "state_te", "price")
cols_to_convert <- setdiff(names(df_lr), exclude_cols)

df_lr[cols_to_convert] <- lapply(df_lr[cols_to_convert], function(x) ifelse(x == "Yes", 1, 0))
df_lr$scaled_price <- min_max_scale(df_lr$price)
df_lr$scaled_city <- min_max_scale(df_lr$cityname_te)
df_lr$scaled_state <- min_max_scale(df_lr$state_te)
df_lr$scaled_sq_feet <- min_max_scale(df_lr$square_feet)

Train-test split

set.seed(42)  # For reproducibility
train_indices <- createDataPartition(df_lr$price, p = 0.8, list = FALSE)

train_set <- df_lr[train_indices, ]
test_set <- df_lr[-train_indices, ]

Model 1 with all predictors

model1 <- lm(
  price ~ cityname_te + state_te + square_feet + bedrooms + bathrooms +
    Dishwasher + Elevator + `Patio/Deck` + Pool + Storage + Refrigerator +
    AC + Basketball + `Cable or Satellite` + Gym + `Internet Access` +
    Clubhouse + Parking + `Garbage Disposal` + Fireplace + `Washer Dryer` +
    Playground + Gated + TV + `Hot Tub` + Tennis + `Wood Floors` + View +
    Alarm + Doorman + Luxury + Golf + Cats + Dogs,
  data = train_set
)
summary(model1)
## 
## Call:
## lm(formula = price ~ cityname_te + state_te + square_feet + bedrooms + 
##     bathrooms + Dishwasher + Elevator + `Patio/Deck` + Pool + 
##     Storage + Refrigerator + AC + Basketball + `Cable or Satellite` + 
##     Gym + `Internet Access` + Clubhouse + Parking + `Garbage Disposal` + 
##     Fireplace + `Washer Dryer` + Playground + Gated + TV + `Hot Tub` + 
##     Tennis + `Wood Floors` + View + Alarm + Doorman + Luxury + 
##     Golf + Cats + Dogs, data = train_set)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6252.2  -234.1   -13.4   185.9 14185.4 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -8.043e+02  2.636e+01 -30.516  < 2e-16 ***
## cityname_te           9.010e-01  8.587e-03 104.930  < 2e-16 ***
## state_te              1.198e-01  1.429e-02   8.381  < 2e-16 ***
## square_feet           1.984e-01  1.175e-02  16.882  < 2e-16 ***
## bedrooms              7.428e+01  9.587e+00   7.748 1.05e-14 ***
## bathrooms             3.176e+02  1.514e+01  20.981  < 2e-16 ***
## Dishwasher            4.188e+01  1.964e+01   2.132 0.033034 *  
## Elevator              1.360e+02  2.667e+01   5.097 3.53e-07 ***
## `Patio/Deck`          3.859e+00  1.778e+01   0.217 0.828223    
## Pool                  2.308e+01  1.625e+01   1.420 0.155692    
## Storage               2.328e+01  1.885e+01   1.235 0.216746    
## Refrigerator         -1.957e+01  2.093e+01  -0.935 0.349817    
## AC                   -4.886e+00  2.819e+01  -0.173 0.862394    
## Basketball           -6.540e+01  3.691e+01  -1.772 0.076444 .  
## `Cable or Satellite` -5.712e+01  2.410e+01  -2.370 0.017791 *  
## Gym                   2.352e+00  2.225e+01   0.106 0.915833    
## `Internet Access`     3.825e+01  2.332e+01   1.640 0.100980    
## Clubhouse             9.360e-01  2.206e+01   0.042 0.966151    
## Parking               5.825e+01  1.503e+01   3.876 0.000107 ***
## `Garbage Disposal`   -9.007e+01  2.384e+01  -3.778 0.000159 ***
## Fireplace            -3.886e+01  2.193e+01  -1.772 0.076487 .  
## `Washer Dryer`       -9.379e+00  2.398e+01  -0.391 0.695745    
## Playground           -8.943e+01  2.527e+01  -3.539 0.000404 ***
## Gated                -8.179e+01  3.011e+01  -2.716 0.006622 ** 
## TV                    5.591e+01  4.376e+01   1.278 0.201407    
## `Hot Tub`             1.190e+00  3.514e+01   0.034 0.972982    
## Tennis               -1.552e+00  3.136e+01  -0.049 0.960544    
## `Wood Floors`         7.003e+01  3.356e+01   2.087 0.036939 *  
## View                  6.654e+01  5.106e+01   1.303 0.192483    
## Alarm                -1.937e+01  1.191e+02  -0.163 0.870793    
## Doorman              -3.850e+02  1.192e+02  -3.229 0.001247 ** 
## Luxury                1.422e+02  1.716e+02   0.829 0.407348    
## Golf                  1.829e+02  1.417e+02   1.291 0.196823    
## Cats                 -4.412e+01  2.664e+01  -1.656 0.097667 .  
## Dogs                  3.554e+01  2.632e+01   1.350 0.177093    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 541.2 on 7966 degrees of freedom
## Multiple R-squared:  0.7515, Adjusted R-squared:  0.7504 
## F-statistic: 708.4 on 34 and 7966 DF,  p-value: < 2.2e-16

From the summary above, the predictors below are not statistically significant, since their p-values are greater than the 0.05 threshold:

Patio/Deck, Pool, Storage, Refrigerator, AC, Basketball, Gym, Internet Access, Clubhouse, Fireplace, Washer Dryer, TV, Hot Tub, Tennis, View, Alarm, Luxury, Golf, Cats, Dogs.
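The same list can be read off programmatically (a quick sketch):

# Predictors in model1 whose p-value exceeds 0.05
coefs <- summary(model1)$coefficients
rownames(coefs)[coefs[, "Pr(>|t|)"] > 0.05]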

plot_coefficients(model1)

plot_vif(model1)

# mean squared error (computed on the training residuals)
mse <- mean(model1$residuals ^ 2)

# root mean squared error
rmse <- sqrt(mse)

cat(mse, rmse)
## 291606.9 540.0064
summary(model1)$r.squared
## [1] 0.7514725

Model 2 without insignificant predictors

model2 <- lm(price ~ cityname_te + state_te + square_feet + bedrooms + bathrooms +
             Dishwasher + Elevator + `Cable or Satellite` + Parking + `Garbage Disposal` +
             Playground + Gated + `Wood Floors` + Doorman,
             data = train_set)

summary(model2)
## 
## Call:
## lm(formula = price ~ cityname_te + state_te + square_feet + bedrooms + 
##     bathrooms + Dishwasher + Elevator + `Cable or Satellite` + 
##     Parking + `Garbage Disposal` + Playground + Gated + `Wood Floors` + 
##     Doorman, data = train_set)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6223.4  -236.7   -10.4   184.5 14189.1 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -8.040e+02  2.459e+01 -32.696  < 2e-16 ***
## cityname_te           9.023e-01  8.554e-03 105.483  < 2e-16 ***
## state_te              1.180e-01  1.415e-02   8.333  < 2e-16 ***
## square_feet           1.965e-01  1.171e-02  16.782  < 2e-16 ***
## bedrooms              7.202e+01  9.469e+00   7.606 3.15e-14 ***
## bathrooms             3.199e+02  1.499e+01  21.339  < 2e-16 ***
## Dishwasher            3.110e+01  1.597e+01   1.947 0.051561 .  
## Elevator              1.446e+02  2.620e+01   5.517 3.55e-08 ***
## `Cable or Satellite` -4.520e+01  2.011e+01  -2.248 0.024628 *  
## Parking               6.778e+01  1.421e+01   4.770 1.87e-06 ***
## `Garbage Disposal`   -8.381e+01  2.256e+01  -3.715 0.000205 ***
## Playground           -8.928e+01  2.320e+01  -3.849 0.000119 ***
## Gated                -7.626e+01  2.901e+01  -2.628 0.008598 ** 
## `Wood Floors`         7.528e+01  3.264e+01   2.306 0.021109 *  
## Doorman              -3.509e+02  1.182e+02  -2.969 0.003001 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 541.3 on 7986 degrees of freedom
## Multiple R-squared:  0.7507, Adjusted R-squared:  0.7503 
## F-statistic:  1718 on 14 and 7986 DF,  p-value: < 2.2e-16
plot_coefficients(model2)

plot_vif(model2) 

# mean squared error (computed on the training residuals)
mse <- mean(model2$residuals ^ 2)
# root mean squared error
rmse <- sqrt(mse)

cat(mse, rmse)
## 292498.5 540.8313
summary(model2)$r.squared
## [1] 0.7507127

Model 3 with scaled values

model3 <- lm(scaled_price ~ scaled_city+scaled_state+scaled_sq_feet+bedrooms+bathrooms +
            Dishwasher + Elevator + `Cable or Satellite` + Parking + `Garbage Disposal` +
            Playground + Gated + `Wood Floors` + Doorman, data = train_set)

summary(model3)
## 
## Call:
## lm(formula = scaled_price ~ scaled_city + scaled_state + scaled_sq_feet + 
##     bedrooms + bathrooms + Dishwasher + Elevator + `Cable or Satellite` + 
##     Parking + `Garbage Disposal` + Playground + Gated + `Wood Floors` + 
##     Doorman, data = train_set)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.118995 -0.004527 -0.000199  0.003528  0.271303 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -0.0137581  0.0003782 -36.382  < 2e-16 ***
## scaled_city           0.9023487  0.0085545 105.483  < 2e-16 ***
## scaled_state          0.0067120  0.0008055   8.333  < 2e-16 ***
## scaled_sq_feet        0.1498797  0.0089312  16.782  < 2e-16 ***
## bedrooms              0.0013771  0.0001810   7.606 3.15e-14 ***
## bathrooms             0.0061171  0.0002867  21.339  < 2e-16 ***
## Dishwasher            0.0005946  0.0003054   1.947 0.051561 .  
## Elevator              0.0027639  0.0005009   5.517 3.55e-08 ***
## `Cable or Satellite` -0.0008642  0.0003845  -2.248 0.024628 *  
## Parking               0.0012961  0.0002717   4.770 1.87e-06 ***
## `Garbage Disposal`   -0.0016026  0.0004314  -3.715 0.000205 ***
## Playground           -0.0017071  0.0004435  -3.849 0.000119 ***
## Gated                -0.0014581  0.0005548  -2.628 0.008598 ** 
## `Wood Floors`         0.0014393  0.0006240   2.306 0.021109 *  
## Doorman              -0.0067095  0.0022602  -2.969 0.003001 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01035 on 7986 degrees of freedom
## Multiple R-squared:  0.7507, Adjusted R-squared:  0.7503 
## F-statistic:  1718 on 14 and 7986 DF,  p-value: < 2.2e-16
plot_coefficients(model3)

plot_vif(model3)

# mean squared error (computed on the training residuals)
mse <- mean(model3$residuals ^ 2)
# root mean squared error
rmse <- sqrt(mse)

cat(mse, rmse)
## 0.0001069351 0.01034094
summary(model3)$r.squared
## [1] 0.7507127
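One observation on model 3: min-max scaling is a linear transformation, so it fits the same relationship as model 2. The R² (0.7507) is identical; only the scale of the coefficients, residuals, and RMSE changes.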

K-fold cross validation

library(caret)

# Define cross-validation method: 5-fold CV (you can change to 10, etc.)
train_control <- trainControl(method = "cv", number = 5)

# Fit the model with train()
cv_model <- train(
  price ~ cityname_te + state_te + square_feet + bedrooms + bathrooms +
    Dishwasher + Elevator + `Cable or Satellite` + Parking + `Garbage Disposal` +
    Playground + Gated + `Wood Floors` + Doorman,
  data = train_set,
  method = "lm",
  trControl = train_control
)

print(cv_model)
## Linear Regression 
## 
## 8001 samples
##   14 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 6401, 6399, 6402, 6402, 6400 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   591.2377  0.6714998  312.0822
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
library(caret)

# Define cross-validation method: 5-fold CV (you can change to 10, etc.)
train_control <- trainControl(method = "cv", number = 5)

# Fit the model with train()
cv_model <- train(
  scaled_price ~ scaled_city+scaled_state+scaled_sq_feet + bedrooms + bathrooms +
    Dishwasher + Elevator + `Cable or Satellite` + Parking + `Garbage Disposal` +
    Playground + Gated + `Wood Floors` + Doorman,
  data = train_set,
  method = "lm",
  trControl = train_control
)

print(cv_model)
## Linear Regression 
## 
## 8001 samples
##   14 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 6400, 6401, 6401, 6401, 6401 
## Resampling results:
## 
##   RMSE        Rsquared   MAE        
##   0.01148586  0.6690222  0.005966564
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE