# Load libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
library(scales)
library(caret)
## Loading required package: lattice
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4 ✔ tibble 3.2.1
## ✔ purrr 1.0.4 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ randomForest::combine() masks dplyr::combine()
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ ggplot2::margin() masks randomForest::margin()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:purrr':
##
## some
##
## The following object is masked from 'package:dplyr':
##
## recode
library(boot)
##
## Attaching package: 'boot'
##
## The following object is masked from 'package:car':
##
## logit
##
## The following object is masked from 'package:lattice':
##
## melanoma
library(Metrics) # for rmse
##
## Attaching package: 'Metrics'
##
## The following objects are masked from 'package:caret':
##
## precision, recall
library(caret) # for R2
df <- read.csv('cleaned_apartments.csv')
head(df)
names(df)
## [1] "id" "category" "title" "body"
## [5] "amenities" "bathrooms" "bedrooms" "currency"
## [9] "fee" "has_photo" "pets_allowed" "price"
## [13] "price_display" "price_type" "square_feet" "address"
## [17] "cityname" "state" "latitude" "longitude"
## [21] "source" "time"
Out of the 22 columns, I’m only going to use the 10 below (a quick selection sketch follows the list):
amenities
bathrooms
bedrooms
fee
has_photo
pets_allowed
price
square_feet
cityname
state
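To make that selection explicit, here is a minimal sketch, assuming the column names shown by names(df) above. The rest of this analysis keeps the full df and drops unused columns later, so this subset is optional.
# Optional: an explicit working subset of the 10 columns (sketch)
df_subset <- df |>
  select(amenities, bathrooms, bedrooms, fee, has_photo,
         pets_allowed, price, square_feet, cityname, state)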
value_counts <- function(df, col_name) {
  result <- df |>
    group_by({{ col_name }}) |>
    summarise(n = n(), .groups = 'drop') |>
    arrange(desc(n))
  return(as.data.frame(result)) # Force data.frame output
}
value_counts(df, amenities)
Looks like a single listing can have multiple amenities packed into one field. I want to see which distinct amenities appear across the entire dataset.
split_values <- strsplit(df$amenities, split = ",")
all_values <- unlist(split_values)
all_values <- trimws(all_values)
value_counts_temp <- as.data.frame(table(all_values))
value_counts_temp <- value_counts_temp[order(-value_counts_temp$Freq), ] # Sort descending
print(value_counts_temp)
## all_values Freq
## 17 Parking 3727
## 6 Dishwasher 3266
## 20 Pool 3238
## 21 Refrigerator 3133
## 18 Patio/Deck 2472
## 4 Cable or Satellite 1678
## 22 Storage 1531
## 13 Gym 1469
## 15 Internet Access 1441
## 5 Clubhouse 1317
## 10 Garbage Disposal 1210
## 26 Washer Dryer 1077
## 9 Fireplace 1065
## 19 Playground 782
## 1 AC 662
## 8 Elevator 642
## 23 Tennis 482
## 11 Gated 473
## 27 Wood Floors 357
## 14 Hot Tub 346
## 3 Basketball 318
## 24 TV 207
## 25 View 149
## 7 Doorman 29
## 2 Alarm 23
## 12 Golf 23
## 16 Luxury 11
Looks like there are 27 distinct amenities in our dataset. We can explode this field into 27 new binary columns, one per amenity.
unique_values <- unique(all_values)
for (val in unique_values) {
  df[[val]] <- ifelse(grepl(val, df$amenities), "Yes", "No")
}
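One caveat worth flagging: grepl() does substring matching, so an amenity whose name is contained in another (not the case in this list, but e.g. a hypothetical "Golf" vs. "Golf Course") would produce false positives. A stricter token-based sketch matches each amenity exactly against the comma-separated entries:
# Stricter alternative (sketch): match whole comma-separated tokens only
amenity_tokens <- lapply(strsplit(df$amenities, ","), trimws)
for (val in unique_values) {
  df[[val]] <- ifelse(
    vapply(amenity_tokens, function(tokens) val %in% tokens, logical(1)),
    "Yes", "No"
  )
}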
value_counts(df, bathrooms)
There are some null values in this column, so I’m replacing them with 1, since an apartment with zero bathrooms doesn’t make sense.
df$bathrooms[is.na(df$bathrooms)] <- 1.0
Checking whether the values were replaced.
value_counts(df, bathrooms)
ggplot(df, aes(x = as.factor(bathrooms), y = price)) +
  geom_boxplot() +
  labs(title = "Price vs. Number of Bathrooms", x = "Number of bathrooms", y = "Price") +
  theme_minimal()
value_counts(df, bedrooms)
Replacing null values with 1.
df$bedrooms[is.na(df$bedrooms)] <- 1.0
ggplot(df, aes(x = as.factor(bedrooms), y = price)) +
  geom_boxplot() +
  labs(title = "Price vs. Number of Bedrooms", x = "Number of bedrooms", y = "Price") +
  theme_minimal()
value_counts(df, fee)
ggplot(df, aes(x = fee, y = price)) +
  geom_boxplot() +
  labs(title = "Price vs. Fee", x = "Fee", y = "Price") +
  theme_minimal()
Since this column only contains ‘No’, we can ignore it.
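Before discarding it, a quick programmatic sanity check (a sketch; it assumes fee has no missing values):
# Confirm fee really is constant before ignoring it
stopifnot(length(unique(df$fee)) == 1)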
value_counts(df, has_photo)
ggplot(df, aes(x = has_photo, y = price)) +
  geom_boxplot() +
  labs(title = "Price vs. Has Photo", x = "Has photo", y = "Price") +
  theme_minimal()
Next, I’m going to split the pets_allowed column into separate Cats and Dogs indicator columns.
value_counts(df, pets_allowed)
ggplot(df, aes(x = pets_allowed, y = price)) +
  geom_boxplot() +
  labs(title = "Price vs. Pets Allowed", x = "Pets allowed", y = "Price") +
  theme_minimal()
unique_values <- c('Cats', 'Dogs')
for (val in unique_values) {
  df[[val]] <- ifelse(grepl(val, df$pets_allowed), "Yes", "No")
}
df |>
  # filter(square_feet < 3000) |> # optionally trim extreme outliers
  ggplot(aes(x = square_feet, y = price)) +
  geom_point(color = 'steelblue') +
  geom_smooth(method = "lm", se = TRUE, color = "black") + # Regression line
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  labs(
    x = "Square Feet",
    y = "Price",
    title = "Square Feet vs. Price"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
Looks like there are a few outliers that might influence our model.
Moving on to the city column: first, let’s fill the null and empty rows with ‘Others’ and then inspect.
df$cityname[is.na(df$cityname) | df$cityname == ""] <- 'Others'
value_counts(df, cityname)
Since there are more than 1,500 cities, I am going to reduce them to 3 tiers; cutting the cardinality down this way should help the model.
Tier 1 (count > 50)
Tier 2 (10 <= count <= 50)
Tier 3 (count < 10)
city_counts <- df |>
  count(cityname)
city_counts <- city_counts |>
  mutate(
    city_tier = case_when(
      n > 50 ~ "Tier 1",
      n >= 10 ~ "Tier 2", # 10–50, since counts above 50 matched the first branch
      TRUE ~ "Tier 3"     # everything below 10
    )
  )
df <- df |>
  left_join(city_counts |> select(cityname, city_tier), by = "cityname")
ggplot(df, aes(x = city_tier, y = price)) +
  geom_boxplot() +
  labs(title = "Price vs. City Tier", x = "City tier", y = "Price") +
  theme_minimal()
Now creating a second set of tiers, this time based on the mean rent per city.
Tier 1 (avg_rent > $3500)
Tier 2 ($1500 <= avg_rent <= $3500)
Tier 3 (avg_rent < $1500)
# Step 1: Calculate average rent per city
city_avg_rent <- df |>
  group_by(cityname) |>
  summarise(avg_rent = mean(price, na.rm = TRUE))
# Step 2: Assign tiers based on average rent
city_avg_rent <- city_avg_rent |>
  mutate(
    rent_tier = case_when(
      avg_rent > 3500 ~ "Tier 1",
      avg_rent >= 1500 & avg_rent <= 3500 ~ "Tier 2",
      avg_rent < 1500 ~ "Tier 3"
    )
  )
# Step 3: Join the rent tier back to the original dataframe
df <- df |>
  left_join(city_avg_rent |> select(cityname, rent_tier), by = "cityname")
ggplot(df, aes(x = rent_tier, y = price)) +
  geom_boxplot() +
  labs(title = "Price vs. Rent Tier", x = "Rent tier", y = "Price") +
  theme_minimal()
value_counts(df, rent_tier)
value_counts(df, city_tier)
We will also target encode city and state; these numeric encodings will be useful for the linear regression models.
# Target encoding: for each categorical column, add a <col>_te column holding
# the mean of the target within that category.
target_encode <- function(df, cat_cols, target_col) {
  for (col in cat_cols) {
    means <- aggregate(df[[target_col]], by = list(df[[col]]), FUN = mean, na.rm = TRUE)
    names(means) <- c(col, paste0(col, "_te"))
    df <- merge(df, means, by = col, all.x = TRUE) # note: merge() may reorder rows
    # df[[col]] <- NULL # optionally drop the original column
  }
  return(df)
}
df <- target_encode(df, "cityname", "price")
df <- target_encode(df, "state", "price")
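One caveat (my note, not part of the original workflow): computing the category means on the full dataset lets the test-set target leak into the features. A leakage-aware sketch would fit the encoding on training rows only and then join it into both splits:
# Leakage-aware sketch: fit the city means on a training subset only
set.seed(1)
tr_idx <- sample(seq_len(nrow(df)), size = 0.7 * nrow(df))
train_city_means <- df[tr_idx, ] |>
  group_by(cityname) |>
  summarise(cityname_te_train = mean(price, na.rm = TRUE), .groups = "drop")
# left_join() this onto both splits; cities unseen in training get NA and
# need a fallback such as the global mean price.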
We will also scale selected numeric columns using a min-max scaler.
min_max_scale <- function(column) {
  scaled <- (column - min(column, na.rm = TRUE)) /
    (max(column, na.rm = TRUE) - min(column, na.rm = TRUE))
  return(scaled)
}
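A quick sanity check of the helper; values map linearly onto [0, 1]:
min_max_scale(c(10, 20, 30)) # returns 0.0 0.5 1.0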
columns_to_ignore <- c(
  "id", "category", "title", "body", "fee", "currency",
  "price_display", "price_type", "state", "address",
  "latitude", "longitude", "source", "time", "amenities",
  "has_photo", "cityname", "pets_allowed", "city_tier", "cityname_te", "state_te"
)
df_new <- df[, !(names(df) %in% columns_to_ignore)]
# Make sure 'price' is numeric
df_new$price <- as.numeric(df_new$price)
# Split into train and test sets
set.seed(1)
train_idx <- sample(seq_len(nrow(df_new)), size = 0.7 * nrow(df_new))
train_data <- df_new[train_idx, ]
test_data <- df_new[-train_idx, ]
# Fit a regression tree (since price is numeric)
tree_model <- rpart(price ~ ., data = train_data, method = "anova")
# Plot the tree
rpart.plot(tree_model)
# Predict on test data
pred <- predict(tree_model, newdata = test_data)
# Calculate RMSE
rmse_val <- rmse(test_data$price, pred)
# Calculate R²
r2_val <- R2(pred, test_data$price)
# Print results
cat("RMSE:", rmse_val, "\n")
## RMSE: 699.5394
cat("R_squared:", r2_val, "\n")
## R_squared: 0.4516386
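The tree above uses rpart’s defaults. As a possible refinement (a sketch, not part of the original analysis), the tree can be pruned at the complexity parameter with the lowest cross-validated error from rpart’s built-in CV table:
# Prune at the cp value with the smallest cross-validated error (sketch)
best_cp <- tree_model$cptable[which.min(tree_model$cptable[, "xerror"]), "CP"]
tree_pruned <- prune(tree_model, cp = best_cp)
rpart.plot(tree_pruned)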
names(df_new) <- make.names(names(df_new))
# Train-test split (70% train, 30% test)
set.seed(123)
train_idx <- sample(seq_len(nrow(df_new)), size = 0.7 * nrow(df_new))
train_data <- df_new[train_idx, ]
test_data <- df_new[-train_idx, ]
rf_model <- randomForest(
  price ~ .,
  data = train_data,
  ntree = 500, # Number of trees
  mtry = floor(sqrt(ncol(train_data) - 1)), # Number of variables tried at each split
  importance = TRUE
)
# View model summary
print(rf_model)
##
## Call:
## randomForest(formula = price ~ ., data = train_data, ntree = 500, mtry = floor(sqrt(ncol(train_data) - 1)), importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 5
##
## Mean of squared residuals: 746867.8
## % Var explained: 39.88
# Variable Importance Plot
varImpPlot(rf_model)
# Predict on test data
predictions <- predict(rf_model, newdata = test_data)
# Calculate RMSE (Root Mean Squared Error)
rmse <- sqrt(mean((predictions - test_data$price)^2)) # note: this masks Metrics::rmse() for the rest of the session
print(paste("Test RMSE:", round(rmse, 2)))
## [1] "Test RMSE: 623.42"
# Create tuning grid
tune_grid <- expand.grid(mtry = seq(2, sqrt(ncol(train_data)-1)*2, by = 1)) # trying different mtry values
# Cross-validation settings
train_control <- trainControl(
  method = "cv", # Cross Validation
  number = 5, # 5-fold CV
  verboseIter = TRUE # Show progress
)
# Train Random Forest with tuning
set.seed(123)
rf_tuned <- train(
  price ~ .,
  data = train_data,
  method = "rf",
  tuneGrid = tune_grid,
  trControl = train_control,
  ntree = 500,
  importance = TRUE
)
## + Fold1: mtry= 2
## - Fold1: mtry= 2
## (identical progress lines for mtry = 3–11 across all 5 folds trimmed)
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 7 on full training set
# Best tuning parameter
print(rf_tuned$bestTune)
## mtry
## 6 7
# Full results
print(rf_tuned)
## Random Forest
##
## 7000 samples
## 33 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 5600, 5600, 5600, 5600, 5600
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 862.2460 0.4305304 398.4646
## 3 813.7618 0.4719169 368.1542
## 4 794.0612 0.4891121 356.4038
## 5 784.7608 0.4966200 350.7674
## 6 775.5746 0.5067759 347.2245
## 7 774.5047 0.5069861 346.9900
## 8 780.3102 0.4995471 348.5823
## 9 778.6647 0.5007279 348.4353
## 10 777.7674 0.5019659 348.9894
## 11 782.2437 0.4975384 350.9046
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 7.
# Plot performance vs. mtry
plot(rf_tuned)
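Note that the hold-out metrics below are computed from the earlier untuned rf_model. To score the tuned forest instead, predict from the caret object (a sketch, output not shown):
# Hold-out RMSE for the tuned model (sketch)
pred_tuned <- predict(rf_tuned, newdata = test_data)
sqrt(mean((pred_tuned - test_data$price)^2))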
actual <- test_data$price
predicted <- predict(rf_model, newdata = test_data)
mae <- mean(abs(actual - predicted))
mse <- mean((actual - predicted)^2)
rmse <- sqrt(mse)
rss <- sum((actual - predicted)^2)
tss <- sum((actual - mean(actual))^2)
r_squared <- 1 - rss/tss
cat("MAE:", mae, "\n")
## MAE: 347.5443
cat("MSE:", mse, "\n")
## MSE: 388648.7
cat("RMSE:", rmse, "\n")
## RMSE: 623.4169
cat("R-squared:", r_squared, "\n")
## R-squared: 0.596768
plot_vif <- function(model, threshold = 5) {
  require(car)
  require(ggplot2)
  vif_values <- vif(model)
  vif_df <- data.frame(
    Variable = names(vif_values),
    VIF = as.numeric(vif_values)
  )
  ggplot(vif_df) +
    geom_col(aes(x = VIF, y = Variable), fill = "steelblue") +
    geom_vline(xintercept = threshold, linetype = "dashed", color = "red", linewidth = 1) +
    labs(title = "VIF Values", x = "VIF", y = "Variable") +
    theme_minimal()
}
plot_coefficients <- function(model) {
  require(ggplot2)
  coef_df <- data.frame(
    Predictor = names(coef(model))[-1],
    Coefficient = coef(model)[-1]
  )
  ggplot(coef_df, aes(x = reorder(Predictor, Coefficient), y = Coefficient)) +
    geom_col(fill = "steelblue") +
    coord_flip() +
    labs(
      title = "Linear Regression Coefficients",
      x = "Predictors",
      y = "Coefficient Value"
    ) +
    theme_minimal()
}
First, we will pull the required columns into a new data frame used just for linear regression.
df_lr <- df |>
  select(
    bathrooms,
    bedrooms,
    square_feet,
    Cats,
    Dogs,
    Dishwasher,
    Elevator,
    `Patio/Deck`,
    Pool,
    Storage,
    Refrigerator,
    AC,
    Basketball,
    `Cable or Satellite`,
    Gym,
    `Internet Access`,
    Clubhouse,
    Parking,
    `Garbage Disposal`,
    Fireplace,
    `Washer Dryer`,
    Playground,
    Gated,
    TV,
    `Hot Tub`,
    Tennis,
    `Wood Floors`,
    View,
    Alarm,
    Doorman,
    Luxury,
    Golf,
    cityname_te,
    state_te,
    price
  )
df_lr
# List of columns to exclude
exclude_cols <- c("bathrooms", "bedrooms", "square_feet", "cityname_te", "state_te", "price")
cols_to_convert <- setdiff(names(df_lr), exclude_cols)
df_lr[cols_to_convert] <- lapply(df_lr[cols_to_convert], function(x) ifelse(x == "Yes", 1, 0))
df_lr$scaled_price <- min_max_scale(df_lr$price)
df_lr$scaled_city <- min_max_scale(df_lr$cityname_te)
df_lr$scaled_state <- min_max_scale(df_lr$state_te)
df_lr$scaled_sq_feet <- min_max_scale(df_lr$square_feet)
set.seed(42) # For reproducibility
train_indices <- createDataPartition(df_lr$price, p = 0.8, list = FALSE)
train_set <- df_lr[train_indices, ]
test_set <- df_lr[-train_indices, ]
model1 <- lm(
  price ~ cityname_te + state_te + square_feet + bedrooms + bathrooms +
    Dishwasher + Elevator + `Patio/Deck` + Pool + Storage + Refrigerator +
    AC + Basketball + `Cable or Satellite` + Gym + `Internet Access` +
    Clubhouse + Parking + `Garbage Disposal` + Fireplace + `Washer Dryer` +
    Playground + Gated + TV + `Hot Tub` + Tennis + `Wood Floors` + View +
    Alarm + Doorman + Luxury + Golf + Cats + Dogs,
  data = train_set
)
summary(model1)
##
## Call:
## lm(formula = price ~ cityname_te + state_te + square_feet + bedrooms +
## bathrooms + Dishwasher + Elevator + `Patio/Deck` + Pool +
## Storage + Refrigerator + AC + Basketball + `Cable or Satellite` +
## Gym + `Internet Access` + Clubhouse + Parking + `Garbage Disposal` +
## Fireplace + `Washer Dryer` + Playground + Gated + TV + `Hot Tub` +
## Tennis + `Wood Floors` + View + Alarm + Doorman + Luxury +
## Golf + Cats + Dogs, data = train_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6252.2 -234.1 -13.4 185.9 14185.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.043e+02 2.636e+01 -30.516 < 2e-16 ***
## cityname_te 9.010e-01 8.587e-03 104.930 < 2e-16 ***
## state_te 1.198e-01 1.429e-02 8.381 < 2e-16 ***
## square_feet 1.984e-01 1.175e-02 16.882 < 2e-16 ***
## bedrooms 7.428e+01 9.587e+00 7.748 1.05e-14 ***
## bathrooms 3.176e+02 1.514e+01 20.981 < 2e-16 ***
## Dishwasher 4.188e+01 1.964e+01 2.132 0.033034 *
## Elevator 1.360e+02 2.667e+01 5.097 3.53e-07 ***
## `Patio/Deck` 3.859e+00 1.778e+01 0.217 0.828223
## Pool 2.308e+01 1.625e+01 1.420 0.155692
## Storage 2.328e+01 1.885e+01 1.235 0.216746
## Refrigerator -1.957e+01 2.093e+01 -0.935 0.349817
## AC -4.886e+00 2.819e+01 -0.173 0.862394
## Basketball -6.540e+01 3.691e+01 -1.772 0.076444 .
## `Cable or Satellite` -5.712e+01 2.410e+01 -2.370 0.017791 *
## Gym 2.352e+00 2.225e+01 0.106 0.915833
## `Internet Access` 3.825e+01 2.332e+01 1.640 0.100980
## Clubhouse 9.360e-01 2.206e+01 0.042 0.966151
## Parking 5.825e+01 1.503e+01 3.876 0.000107 ***
## `Garbage Disposal` -9.007e+01 2.384e+01 -3.778 0.000159 ***
## Fireplace -3.886e+01 2.193e+01 -1.772 0.076487 .
## `Washer Dryer` -9.379e+00 2.398e+01 -0.391 0.695745
## Playground -8.943e+01 2.527e+01 -3.539 0.000404 ***
## Gated -8.179e+01 3.011e+01 -2.716 0.006622 **
## TV 5.591e+01 4.376e+01 1.278 0.201407
## `Hot Tub` 1.190e+00 3.514e+01 0.034 0.972982
## Tennis -1.552e+00 3.136e+01 -0.049 0.960544
## `Wood Floors` 7.003e+01 3.356e+01 2.087 0.036939 *
## View 6.654e+01 5.106e+01 1.303 0.192483
## Alarm -1.937e+01 1.191e+02 -0.163 0.870793
## Doorman -3.850e+02 1.192e+02 -3.229 0.001247 **
## Luxury 1.422e+02 1.716e+02 0.829 0.407348
## Golf 1.829e+02 1.417e+02 1.291 0.196823
## Cats -4.412e+01 2.664e+01 -1.656 0.097667 .
## Dogs 3.554e+01 2.632e+01 1.350 0.177093
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 541.2 on 7966 degrees of freedom
## Multiple R-squared: 0.7515, Adjusted R-squared: 0.7504
## F-statistic: 708.4 on 34 and 7966 DF, p-value: < 2.2e-16
From the summary above we can see that the predictors below are insignificant, since their p-values are greater than the 0.05 threshold:
Patio/Deck, Pool, Storage, Refrigerator, AC, Basketball, Gym, Internet Access, Clubhouse, Fireplace, Washer Dryer, TV, Hot Tub, Tennis, View, Alarm, Luxury, Golf, Cats, Dogs.
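Rather than dropping these by hand, an automated alternative (a sketch, not used in this analysis) is backward stepwise selection by AIC with base R’s step():
# Backward elimination by AIC (sketch); trace = 0 suppresses per-step output
model1_step <- step(model1, direction = "backward", trace = 0)
summary(model1_step)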
plot_coefficients(model1)
plot_vif(model1)
# mean squared error
mse <- mean(model1$residuals ^ 2)
# root mean squared error
rmse <- sqrt(mse)
cat(mse, rmse)
## 291606.9 540.0064
summary(model1)$r.squared
## [1] 0.7514725
model2 <- lm(price ~ cityname_te + state_te + square_feet + bedrooms + bathrooms +
               Dishwasher + Elevator + `Cable or Satellite` + Parking + `Garbage Disposal` +
               Playground + Gated + `Wood Floors` + Doorman,
             data = train_set)
summary(model2)
##
## Call:
## lm(formula = price ~ cityname_te + state_te + square_feet + bedrooms +
## bathrooms + Dishwasher + Elevator + `Cable or Satellite` +
## Parking + `Garbage Disposal` + Playground + Gated + `Wood Floors` +
## Doorman, data = train_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6223.4 -236.7 -10.4 184.5 14189.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.040e+02 2.459e+01 -32.696 < 2e-16 ***
## cityname_te 9.023e-01 8.554e-03 105.483 < 2e-16 ***
## state_te 1.180e-01 1.415e-02 8.333 < 2e-16 ***
## square_feet 1.965e-01 1.171e-02 16.782 < 2e-16 ***
## bedrooms 7.202e+01 9.469e+00 7.606 3.15e-14 ***
## bathrooms 3.199e+02 1.499e+01 21.339 < 2e-16 ***
## Dishwasher 3.110e+01 1.597e+01 1.947 0.051561 .
## Elevator 1.446e+02 2.620e+01 5.517 3.55e-08 ***
## `Cable or Satellite` -4.520e+01 2.011e+01 -2.248 0.024628 *
## Parking 6.778e+01 1.421e+01 4.770 1.87e-06 ***
## `Garbage Disposal` -8.381e+01 2.256e+01 -3.715 0.000205 ***
## Playground -8.928e+01 2.320e+01 -3.849 0.000119 ***
## Gated -7.626e+01 2.901e+01 -2.628 0.008598 **
## `Wood Floors` 7.528e+01 3.264e+01 2.306 0.021109 *
## Doorman -3.509e+02 1.182e+02 -2.969 0.003001 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 541.3 on 7986 degrees of freedom
## Multiple R-squared: 0.7507, Adjusted R-squared: 0.7503
## F-statistic: 1718 on 14 and 7986 DF, p-value: < 2.2e-16
plot_coefficients(model2)
plot_vif(model2)
# mean squared error
mse <- mean(model2$residuals ^ 2)
# root mean squared error
rmse <- sqrt(mse)
cat(mse, rmse)
## 292498.5 540.8313
summary(model2)$r.squared
## [1] 0.7507127
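Since model2 is nested inside model1, a formal check that the dropped terms add nothing (a sketch, not in the original write-up) is a partial F-test:
# Partial F-test: does the full model significantly outperform the reduced one?
anova(model2, model1)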
model3 <- lm(scaled_price ~ scaled_city + scaled_state + scaled_sq_feet + bedrooms + bathrooms +
               Dishwasher + Elevator + `Cable or Satellite` + Parking + `Garbage Disposal` +
               Playground + Gated + `Wood Floors` + Doorman,
             data = train_set)
summary(model3)
##
## Call:
## lm(formula = scaled_price ~ scaled_city + scaled_state + scaled_sq_feet +
## bedrooms + bathrooms + Dishwasher + Elevator + `Cable or Satellite` +
## Parking + `Garbage Disposal` + Playground + Gated + `Wood Floors` +
## Doorman, data = train_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.118995 -0.004527 -0.000199 0.003528 0.271303
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.0137581 0.0003782 -36.382 < 2e-16 ***
## scaled_city 0.9023487 0.0085545 105.483 < 2e-16 ***
## scaled_state 0.0067120 0.0008055 8.333 < 2e-16 ***
## scaled_sq_feet 0.1498797 0.0089312 16.782 < 2e-16 ***
## bedrooms 0.0013771 0.0001810 7.606 3.15e-14 ***
## bathrooms 0.0061171 0.0002867 21.339 < 2e-16 ***
## Dishwasher 0.0005946 0.0003054 1.947 0.051561 .
## Elevator 0.0027639 0.0005009 5.517 3.55e-08 ***
## `Cable or Satellite` -0.0008642 0.0003845 -2.248 0.024628 *
## Parking 0.0012961 0.0002717 4.770 1.87e-06 ***
## `Garbage Disposal` -0.0016026 0.0004314 -3.715 0.000205 ***
## Playground -0.0017071 0.0004435 -3.849 0.000119 ***
## Gated -0.0014581 0.0005548 -2.628 0.008598 **
## `Wood Floors` 0.0014393 0.0006240 2.306 0.021109 *
## Doorman -0.0067095 0.0022602 -2.969 0.003001 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01035 on 7986 degrees of freedom
## Multiple R-squared: 0.7507, Adjusted R-squared: 0.7503
## F-statistic: 1718 on 14 and 7986 DF, p-value: < 2.2e-16
plot_coefficients(model3)
plot_vif(model3)
# mean squared error
mse <- mean(model3$residuals ^ 2)
# root mean squared error
rmse <- sqrt(mse)
cat(mse, rmse)
## 0.0001069351 0.01034094
summary(model3)$r.squared
## [1] 0.7507127
library(caret)
# Define cross-validation method: 5-fold CV (you can change to 10, etc.)
train_control <- trainControl(method = "cv", number = 5)
# Fit the model with train()
cv_model <- train(
  price ~ cityname_te + state_te + square_feet + bedrooms + bathrooms +
    Dishwasher + Elevator + `Cable or Satellite` + Parking + `Garbage Disposal` +
    Playground + Gated + `Wood Floors` + Doorman,
  data = train_set,
  method = "lm",
  trControl = train_control
)
print(cv_model)
## Linear Regression
##
## 8001 samples
## 14 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 6401, 6399, 6402, 6402, 6400
## Resampling results:
##
## RMSE Rsquared MAE
## 591.2377 0.6714998 312.0822
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
library(caret)
# Define cross-validation method: 5-fold CV (you can change to 10, etc.)
train_control <- trainControl(method = "cv", number = 5)
# Fit the model with train()
cv_model <- train(
  scaled_price ~ scaled_city + scaled_state + scaled_sq_feet + bedrooms + bathrooms +
    Dishwasher + Elevator + `Cable or Satellite` + Parking + `Garbage Disposal` +
    Playground + Gated + `Wood Floors` + Doorman,
  data = train_set,
  method = "lm",
  trControl = train_control
)
print(cv_model)
## Linear Regression
##
## 8001 samples
## 14 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 6400, 6401, 6401, 6401, 6401
## Resampling results:
##
## RMSE Rsquared MAE
## 0.01148586 0.6690222 0.005966564
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
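As a final check (a sketch beyond the original output), the cross-validated model can be scored on the held-out test_set from the earlier createDataPartition split:
# Hold-out evaluation of the cross-validated linear model (sketch)
test_pred <- predict(cv_model, newdata = test_set)
caret::RMSE(test_pred, test_set$scaled_price)
caret::R2(test_pred, test_set$scaled_price)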