#Author: Joel Trematore
#lOAD LIBRARIES
library(tidyverse)
library(caret)
library(dplyr)
library(corrplot)
#library(lme4)
#library(lmerTest)
#library(effects)
for(i in 1:length(data_files)) {
# Head of for-loop
assign(paste0("data_", i),
# Read and store data frames from the file location with the program file and data files
read.csv(paste0("C:/Users/joeyj/Desktop/CAPSTONE/groupcsv/good_code/markdown/",data_files[i])))
}
#CREATE A LIST OF FRAMES TO READ
list_of_frames <- list(data_1,data_2,data_3,data_4,data_5,
data_6,data_7,data_8,data_9,data_10)
#CREATE EMPTY DATAFRAME TO FILL
info_frame <- data.frame()
info_frame_two <-data.frame()
#CREATE LIST TO CONTAIN MODEL INFO
list_of_models <- list()
list_of_model_summary <- list()
######################################################################## PART 1
# THIS SECTION USES ALL FEATURES FROM THE FRAME AND CREATES LINEAR MODELS FROM EACH
# ZIP CODE FRAME
i = 1
#CREATE ANOTHER FOR LOOP SO WE CAN CREATE THREE DIFFERENT MODELS
for (frame in list_of_frames){
set.seed(123)
partition <- createDataPartition(frame$Listing.price, p = 0.8, list = FALSE)
#SPLIT DATA INTO TRAIN AND TEST FRAMES
train <- frame[partition,]
test <- frame[-partition,]
#CREATE FIRST MODEL WITH ALL FEATURES
model_one_train <- lm(as.numeric(Listing.price) ~ Year.built+Type+ Bedrooms+Total.baths+
X1.2.Baths + X..of.garage.spaces+
X..of.carports+ Lot.acres+ Lot.sqft+ Taxes+
Main.house.sqft+ X..of.Fireplaces+Pool+Status
,data = train)
#CREATE MODEL SUMMARY
model_one_train_summary <- summary(model_one_train)
#SAVE THE PREDICTED VALUES TO THE TEST FRAME
test$predicted_p <- predict(model_one_train,test)
list_of_model_summary <- append(list_of_model_summary,list(model_one_train_summary))
list_of_models <- append(list_of_models,list(model_one_train))
#SAVE PREDICTIONS INTO TEST FRAME
test$predicted_p <- predict(model_one_train,test)
#TAKE THE MEAN AND SQUARE THE VALUES FOR THE MSE
mse <- mean((test$Listing.price - test$predicted_p)^2)
off_by <- sqrt(mse)
r_squared <- model_one_train_summary$r.squared*100
zipcode <- frame$Zip[1]
#ROW/COLUMN
info_frame[i,1] <- mse
info_frame[i,2] <- off_by
info_frame[i,3] <- r_squared
info_frame[i,4] <- zipcode
info_frame[i,5] <- nrow(frame)
info_frame[i,6] <- "Non AIC"
i <- i +1
}#outer for loop
## [1] "lm" "lm" "lm" "lm" "lm" "lm" "lm" "lm" "lm" "lm"
######################################################################## PART 2
#MODEL 1
#THIS SECTION USES AIC FUNCTION TO CHOOSE FEATURES
#CREATE PARTITION AND SET SEED
set.seed(123)
partition_two <- createDataPartition(data_2$Listing.price, p = 0.8, list = FALSE)
#SPLIT INTO TEST AND TRAIN
train_two <- data_2[partition_two,]
test_two <- data_2[-partition_two,]
#CREATE MODEL WITH FIRST DATA SET
model_set_one <- lm(as.numeric(Listing.price) ~ Year.built+Type+ Bedrooms+Total.baths+
X1.2.Baths + X..of.garage.spaces+
X..of.carports+ Lot.acres+ Lot.sqft+ Taxes+
Main.house.sqft+ X..of.Fireplaces+Pool+Status
,data = data_2)
#USE STEP FUNCTION TO SEE THE NEW MODEL WE GET
step_model_final_one <- step(model_set_one)
## Start: AIC=3100.67
## as.numeric(Listing.price) ~ Year.built + Type + Bedrooms + Total.baths +
## X1.2.Baths + X..of.garage.spaces + X..of.carports + Lot.acres +
## Lot.sqft + Taxes + Main.house.sqft + X..of.Fireplaces + Pool +
## Status
##
## Df Sum of Sq RSS AIC
## - X..of.carports 1 1.3583e+08 5.4602e+10 3099.1
## - Taxes 1 1.6457e+08 5.4631e+10 3099.1
## - X..of.garage.spaces 1 1.7407e+08 5.4640e+10 3099.2
## - Bedrooms 1 2.0408e+08 5.4670e+10 3099.3
## - X1.2.Baths 1 2.6811e+08 5.4734e+10 3099.4
## - Lot.sqft 1 6.0001e+08 5.5066e+10 3100.4
## - Lot.acres 1 6.7326e+08 5.5139e+10 3100.6
## <none> 5.4466e+10 3100.7
## - Total.baths 1 1.4723e+09 5.5938e+10 3102.8
## - X..of.Fireplaces 1 1.5649e+09 5.6031e+10 3103.1
## - Pool 1 4.4797e+09 5.8946e+10 3111.0
## - Type 1 9.2542e+09 6.3720e+10 3123.2
## - Status 2 1.2170e+10 6.6636e+10 3128.1
## - Year.built 1 1.1409e+10 6.5875e+10 3128.3
## - Main.house.sqft 1 2.7833e+10 8.2299e+10 3163.1
##
## Step: AIC=3099.06
## as.numeric(Listing.price) ~ Year.built + Type + Bedrooms + Total.baths +
## X1.2.Baths + X..of.garage.spaces + Lot.acres + Lot.sqft +
## Taxes + Main.house.sqft + X..of.Fireplaces + Pool + Status
##
## Df Sum of Sq RSS AIC
## - X..of.garage.spaces 1 1.0179e+08 5.4704e+10 3097.4
## - Bedrooms 1 1.6282e+08 5.4765e+10 3097.5
## - Taxes 1 2.0539e+08 5.4807e+10 3097.6
## - X1.2.Baths 1 2.5774e+08 5.4860e+10 3097.8
## - Lot.sqft 1 5.7002e+08 5.5172e+10 3098.7
## - Lot.acres 1 6.4391e+08 5.5246e+10 3098.9
## <none> 5.4602e+10 3099.1
## - Total.baths 1 1.4347e+09 5.6037e+10 3101.1
## - X..of.Fireplaces 1 1.4359e+09 5.6038e+10 3101.1
## - Pool 1 4.3559e+09 5.8958e+10 3109.0
## - Type 1 1.0414e+10 6.5016e+10 3124.3
## - Status 2 1.2156e+10 6.6758e+10 3126.4
## - Year.built 1 1.1315e+10 6.5917e+10 3126.4
## - Main.house.sqft 1 2.8495e+10 8.3097e+10 3162.6
##
## Step: AIC=3097.35
## as.numeric(Listing.price) ~ Year.built + Type + Bedrooms + Total.baths +
## X1.2.Baths + Lot.acres + Lot.sqft + Taxes + Main.house.sqft +
## X..of.Fireplaces + Pool + Status
##
## Df Sum of Sq RSS AIC
## - Bedrooms 1 1.9425e+08 5.4898e+10 3095.9
## - Taxes 1 2.0627e+08 5.4910e+10 3095.9
## - X1.2.Baths 1 3.1787e+08 5.5022e+10 3096.3
## - Lot.sqft 1 6.0055e+08 5.5304e+10 3097.1
## - Lot.acres 1 6.7685e+08 5.5381e+10 3097.3
## <none> 5.4704e+10 3097.4
## - X..of.Fireplaces 1 1.4365e+09 5.6140e+10 3099.4
## - Total.baths 1 1.5899e+09 5.6294e+10 3099.8
## - Pool 1 4.2846e+09 5.8988e+10 3107.1
## - Status 2 1.2056e+10 6.6760e+10 3124.4
## - Type 1 1.2252e+10 6.6956e+10 3126.9
## - Year.built 1 1.8273e+10 7.2977e+10 3140.3
## - Main.house.sqft 1 2.9580e+10 8.4284e+10 3162.8
##
## Step: AIC=3095.91
## as.numeric(Listing.price) ~ Year.built + Type + Total.baths +
## X1.2.Baths + Lot.acres + Lot.sqft + Taxes + Main.house.sqft +
## X..of.Fireplaces + Pool + Status
##
## Df Sum of Sq RSS AIC
## - X1.2.Baths 1 2.4810e+08 5.5146e+10 3094.6
## - Lot.sqft 1 5.2078e+08 5.5419e+10 3095.4
## - Lot.acres 1 5.9345e+08 5.5491e+10 3095.6
## - Taxes 1 5.9627e+08 5.5494e+10 3095.6
## <none> 5.4898e+10 3095.9
## - X..of.Fireplaces 1 1.2829e+09 5.6181e+10 3097.5
## - Total.baths 1 1.6165e+09 5.6515e+10 3098.4
## - Pool 1 4.0911e+09 5.8989e+10 3105.1
## - Status 2 1.2176e+10 6.7074e+10 3123.2
## - Type 1 1.2418e+10 6.7316e+10 3125.7
## - Year.built 1 1.8166e+10 7.3064e+10 3138.5
## - Main.house.sqft 1 5.0900e+10 1.0580e+11 3196.3
##
## Step: AIC=3094.61
## as.numeric(Listing.price) ~ Year.built + Type + Total.baths +
## Lot.acres + Lot.sqft + Taxes + Main.house.sqft + X..of.Fireplaces +
## Pool + Status
##
## Df Sum of Sq RSS AIC
## - Lot.sqft 1 5.7897e+08 5.5725e+10 3094.2
## - Taxes 1 5.8917e+08 5.5735e+10 3094.3
## - Lot.acres 1 6.5579e+08 5.5802e+10 3094.5
## <none> 5.5146e+10 3094.6
## - X..of.Fireplaces 1 1.3567e+09 5.6503e+10 3096.4
## - Total.baths 1 1.7079e+09 5.6854e+10 3097.4
## - Pool 1 4.2314e+09 5.9378e+10 3104.1
## - Status 2 1.2018e+10 6.7165e+10 3121.4
## - Type 1 1.2246e+10 6.7392e+10 3123.9
## - Year.built 1 1.8474e+10 7.3620e+10 3137.7
## - Main.house.sqft 1 6.0671e+10 1.1582e+11 3208.4
##
## Step: AIC=3094.24
## as.numeric(Listing.price) ~ Year.built + Type + Total.baths +
## Lot.acres + Taxes + Main.house.sqft + X..of.Fireplaces +
## Pool + Status
##
## Df Sum of Sq RSS AIC
## <none> 5.5725e+10 3094.2
## - Taxes 1 9.4505e+08 5.6670e+10 3094.9
## - X..of.Fireplaces 1 1.3425e+09 5.7068e+10 3096.0
## - Total.baths 1 1.7774e+09 5.7503e+10 3097.1
## - Pool 1 3.6785e+09 5.9404e+10 3102.2
## - Type 1 1.2044e+10 6.7769e+10 3122.8
## - Status 2 1.3242e+10 6.8967e+10 3123.5
## - Year.built 1 1.8447e+10 7.4172e+10 3136.8
## - Lot.acres 1 2.9126e+10 8.4851e+10 3157.8
## - Main.house.sqft 1 6.1907e+10 1.1763e+11 3208.8
#CREATE MODEL GIVEN BY STEP FUNCTION
step_model_one <- lm(as.numeric(Listing.price)~ X..of.garage.spaces+
Status +
Pool +
Taxes +
Total.baths+
Type +
Bedrooms +
Year.built +
Lot.sqft +
Main.house.sqft,data = train_two)
step_model_summary_one <- summary(step_model_one)
test_two$predicted_p <- predict(step_model_one,test_two)
step_model_Listing_price_mse <- mean((test_two$Listing.price - test_two$predicted_p)^2)
info_frame_two[1,1] <- step_model_Listing_price_mse
info_frame_two[1,2] <- sqrt(step_model_Listing_price_mse)
info_frame_two[1,3] <- step_model_summary_one$r.squared*100
info_frame_two[1,4] <- data_2$Zip[1]
info_frame_two[1,5] <- nrow(data_2)
info_frame_two[1,6] <- "AIC"
######################################################################## PART 2
#MODEL 2
set.seed(123)
partition_three <- createDataPartition(data_3$Listing.price, p = 0.8, list = FALSE)
#SPLIT INTO TEST AND TRAIN
train_three <- data_3[partition_two,]
test_three <- data_3[-partition_two,]
#CREATE MODEL WITH ORIGINAL DATAFRAME
model_set_two <- lm(as.numeric(Listing.price) ~ Year.built+Type+ Bedrooms+Total.baths+
X1.2.Baths + X..of.garage.spaces+
X..of.carports+ Lot.acres+ Lot.sqft+ Taxes+
Main.house.sqft+ X..of.Fireplaces+Pool+Status
,data = data_3)
#USE STEP FUNCTION TO SEE THE NEW MODEL WE GET
step_model_final_two <- step(model_set_two)
## Start: AIC=4064.88
## as.numeric(Listing.price) ~ Year.built + Type + Bedrooms + Total.baths +
## X1.2.Baths + X..of.garage.spaces + X..of.carports + Lot.acres +
## Lot.sqft + Taxes + Main.house.sqft + X..of.Fireplaces + Pool +
## Status
##
## Df Sum of Sq RSS AIC
## - X1.2.Baths 1 5.8226e+07 4.3449e+11 4062.9
## - Total.baths 1 8.6883e+08 4.3530e+11 4063.3
## - Lot.sqft 1 1.4819e+09 4.3591e+11 4063.5
## - Taxes 1 1.4906e+09 4.3592e+11 4063.5
## - Lot.acres 1 2.3754e+09 4.3681e+11 4063.9
## <none> 4.3443e+11 4064.9
## - X..of.carports 1 5.4142e+09 4.3985e+11 4065.2
## - Year.built 1 7.3797e+09 4.4181e+11 4066.0
## - Status 2 1.8812e+10 4.5324e+11 4068.8
## - Type 1 1.4411e+10 4.4884e+11 4069.0
## - X..of.Fireplaces 1 1.6814e+10 4.5125e+11 4070.0
## - Bedrooms 1 2.0159e+10 4.5459e+11 4071.4
## - Pool 1 6.2881e+10 4.9731e+11 4088.2
## - X..of.garage.spaces 1 9.4229e+10 5.2866e+11 4099.6
## - Main.house.sqft 1 4.0534e+11 8.3978e+11 4186.1
##
## Step: AIC=4062.9
## as.numeric(Listing.price) ~ Year.built + Type + Bedrooms + Total.baths +
## X..of.garage.spaces + X..of.carports + Lot.acres + Lot.sqft +
## Taxes + Main.house.sqft + X..of.Fireplaces + Pool + Status
##
## Df Sum of Sq RSS AIC
## - Taxes 1 1.5215e+09 4.3601e+11 4061.6
## - Lot.sqft 1 1.6956e+09 4.3619e+11 4061.6
## - Total.baths 1 2.6158e+09 4.3711e+11 4062.0
## - Lot.acres 1 2.6743e+09 4.3716e+11 4062.1
## <none> 4.3449e+11 4062.9
## - X..of.carports 1 5.5346e+09 4.4002e+11 4063.3
## - Year.built 1 7.3215e+09 4.4181e+11 4064.0
## - Status 2 1.8825e+10 4.5332e+11 4066.8
## - Type 1 1.4353e+10 4.4884e+11 4067.0
## - X..of.Fireplaces 1 1.6877e+10 4.5137e+11 4068.0
## - Bedrooms 1 2.3343e+10 4.5783e+11 4070.7
## - Pool 1 6.2830e+10 4.9732e+11 4086.2
## - X..of.garage.spaces 1 1.0721e+11 5.4170e+11 4102.1
## - Main.house.sqft 1 4.0750e+11 8.4199e+11 4184.6
##
## Step: AIC=4061.56
## as.numeric(Listing.price) ~ Year.built + Type + Bedrooms + Total.baths +
## X..of.garage.spaces + X..of.carports + Lot.acres + Lot.sqft +
## Main.house.sqft + X..of.Fireplaces + Pool + Status
##
## Df Sum of Sq RSS AIC
## - Lot.sqft 1 1.0808e+09 4.3709e+11 4060.0
## - Lot.acres 1 1.9003e+09 4.3791e+11 4060.4
## - Total.baths 1 3.0552e+09 4.3907e+11 4060.9
## <none> 4.3601e+11 4061.6
## - X..of.carports 1 6.7000e+09 4.4271e+11 4062.4
## - Type 1 1.2968e+10 4.4898e+11 4065.0
## - X..of.Fireplaces 1 1.5817e+10 4.5183e+11 4066.2
## - Status 2 2.2617e+10 4.5863e+11 4067.0
## - Year.built 1 2.1246e+10 4.5726e+11 4068.5
## - Bedrooms 1 2.5299e+10 4.6131e+11 4070.1
## - Pool 1 6.1677e+10 4.9769e+11 4084.3
## - X..of.garage.spaces 1 1.0989e+11 5.4590e+11 4101.6
## - Main.house.sqft 1 4.1188e+11 8.4790e+11 4183.9
##
## Step: AIC=4060.02
## as.numeric(Listing.price) ~ Year.built + Type + Bedrooms + Total.baths +
## X..of.garage.spaces + X..of.carports + Lot.acres + Main.house.sqft +
## X..of.Fireplaces + Pool + Status
##
## Df Sum of Sq RSS AIC
## - Total.baths 1 2.9553e+09 4.4005e+11 4059.3
## <none> 4.3709e+11 4060.0
## - X..of.carports 1 6.8819e+09 4.4397e+11 4060.9
## - Type 1 1.2845e+10 4.4994e+11 4063.4
## - Status 2 2.3106e+10 4.6020e+11 4065.7
## - X..of.Fireplaces 1 1.8327e+10 4.5542e+11 4065.7
## - Year.built 1 2.4203e+10 4.6130e+11 4068.1
## - Bedrooms 1 2.5700e+10 4.6279e+11 4068.7
## - Pool 1 6.0737e+10 4.9783e+11 4082.4
## - X..of.garage.spaces 1 1.0884e+11 5.4593e+11 4099.6
## - Main.house.sqft 1 4.1458e+11 8.5167e+11 4182.8
## - Lot.acres 1 4.3186e+11 8.6895e+11 4186.5
##
## Step: AIC=4059.28
## as.numeric(Listing.price) ~ Year.built + Type + Bedrooms + X..of.garage.spaces +
## X..of.carports + Lot.acres + Main.house.sqft + X..of.Fireplaces +
## Pool + Status
##
## Df Sum of Sq RSS AIC
## <none> 4.4005e+11 4059.3
## - X..of.carports 1 6.4588e+09 4.4651e+11 4060.0
## - Type 1 1.1549e+10 4.5160e+11 4062.1
## - Status 2 2.4307e+10 4.6435e+11 4065.3
## - X..of.Fireplaces 1 1.9517e+10 4.5956e+11 4065.4
## - Year.built 1 2.4584e+10 4.6463e+11 4067.4
## - Bedrooms 1 2.9081e+10 4.6913e+11 4069.2
## - Pool 1 6.4197e+10 5.0424e+11 4082.7
## - X..of.garage.spaces 1 1.0669e+11 5.4674e+11 4097.9
## - Lot.acres 1 4.3471e+11 8.7475e+11 4185.8
## - Main.house.sqft 1 4.9839e+11 9.3844e+11 4198.9
#CREATE MODEL GIVEN BY STEP FUNCTION
step_model_two <- lm(as.numeric(Listing.price)~ X..of.Fireplaces+
Status +
Pool +
Taxes +
Total.baths+
Type +
Bedrooms +
Year.built +
Lot.acres +
Main.house.sqft,data = train_three)
step_model_summary_two <- summary(step_model_two)
test_three$predicted_p <- predict(step_model_two,test_three)
step_model_Listing_price_mse <- mean((test_three$Listing.price - test_three$predicted_p)^2)
info_frame_two[2,1] <- step_model_Listing_price_mse
info_frame_two[2,2] <- sqrt(step_model_Listing_price_mse)
info_frame_two[2,3] <- step_model_summary_two$r.squared*100
info_frame_two[2,4] <- data_3$Zip[1]
info_frame_two[2,5] <- nrow(data_3)
info_frame_two[2,6] <- "AIC"
######################################################################## PART 2
# MODEL 3
set.seed(123)
partition_four <- createDataPartition(data_4$Listing.price, p = 0.8, list = FALSE)
#SPLIT INTO TEST AND TRAIN
train_four <- data_4[partition_two,]
test_four <- data_4[-partition_two,]
#CREATE MODEL WITH ORIGINAL DATAFRAME
model_set_three <- lm(as.numeric(Listing.price) ~ Year.built+Type+ Bedrooms+Total.baths+
X1.2.Baths + X..of.garage.spaces+
X..of.carports+ Lot.acres+ Lot.sqft+ Taxes+
Main.house.sqft+ X..of.Fireplaces+Pool+Status
,data = data_4)
#USE STEP FUNCTION TO SEE THE NEW MODEL WE GET
step_model_final_three <- step(model_set_three)
## Start: AIC=4857.7
## as.numeric(Listing.price) ~ Year.built + Type + Bedrooms + Total.baths +
## X1.2.Baths + X..of.garage.spaces + X..of.carports + Lot.acres +
## Lot.sqft + Taxes + Main.house.sqft + X..of.Fireplaces + Pool +
## Status
##
## Df Sum of Sq RSS AIC
## - Status 2 1.6680e+09 1.6312e+12 4853.9
## - X1.2.Baths 1 4.1591e+08 1.6300e+12 4855.8
## - Total.baths 1 9.7112e+08 1.6306e+12 4855.8
## - Taxes 1 5.4957e+09 1.6351e+12 4856.4
## <none> 1.6296e+12 4857.7
## - Lot.acres 1 2.8093e+10 1.6577e+12 4859.3
## - Lot.sqft 1 3.0421e+10 1.6600e+12 4859.6
## - X..of.garage.spaces 1 3.4494e+10 1.6641e+12 4860.1
## - Bedrooms 1 3.7124e+10 1.6667e+12 4860.5
## - X..of.carports 1 4.6953e+10 1.6765e+12 4861.7
## - Pool 1 1.0458e+11 1.7342e+12 4868.9
## - Type 1 1.0941e+11 1.7390e+12 4869.5
## - X..of.Fireplaces 1 1.3004e+11 1.7596e+12 4872.0
## - Year.built 1 2.2338e+11 1.8530e+12 4882.9
## - Main.house.sqft 1 4.0092e+11 2.0305e+12 4902.3
##
## Step: AIC=4853.92
## as.numeric(Listing.price) ~ Year.built + Type + Bedrooms + Total.baths +
## X1.2.Baths + X..of.garage.spaces + X..of.carports + Lot.acres +
## Lot.sqft + Taxes + Main.house.sqft + X..of.Fireplaces + Pool
##
## Df Sum of Sq RSS AIC
## - X1.2.Baths 1 4.7160e+08 1.6317e+12 4852.0
## - Total.baths 1 7.8025e+08 1.6320e+12 4852.0
## - Taxes 1 7.1318e+09 1.6384e+12 4852.8
## <none> 1.6312e+12 4853.9
## - Lot.acres 1 2.8238e+10 1.6595e+12 4855.6
## - Lot.sqft 1 3.0573e+10 1.6618e+12 4855.9
## - X..of.garage.spaces 1 3.3837e+10 1.6651e+12 4856.3
## - Bedrooms 1 4.0543e+10 1.6718e+12 4857.1
## - X..of.carports 1 4.7968e+10 1.6792e+12 4858.1
## - Pool 1 1.0310e+11 1.7344e+12 4864.9
## - Type 1 1.0957e+11 1.7408e+12 4865.7
## - X..of.Fireplaces 1 1.3045e+11 1.7617e+12 4868.2
## - Year.built 1 2.3734e+11 1.8686e+12 4880.7
## - Main.house.sqft 1 4.1035e+11 2.0416e+12 4899.5
##
## Step: AIC=4851.98
## as.numeric(Listing.price) ~ Year.built + Type + Bedrooms + Total.baths +
## X..of.garage.spaces + X..of.carports + Lot.acres + Lot.sqft +
## Taxes + Main.house.sqft + X..of.Fireplaces + Pool
##
## Df Sum of Sq RSS AIC
## - Total.baths 1 2.6798e+09 1.6344e+12 4850.3
## - Taxes 1 6.7869e+09 1.6385e+12 4850.9
## <none> 1.6317e+12 4852.0
## - Lot.acres 1 2.8121e+10 1.6598e+12 4853.6
## - Lot.sqft 1 3.0446e+10 1.6622e+12 4853.9
## - X..of.garage.spaces 1 3.5015e+10 1.6667e+12 4854.5
## - Bedrooms 1 4.4818e+10 1.6765e+12 4855.7
## - X..of.carports 1 4.7497e+10 1.6792e+12 4856.1
## - Pool 1 1.0313e+11 1.7348e+12 4863.0
## - Type 1 1.0930e+11 1.7410e+12 4863.7
## - X..of.Fireplaces 1 1.3057e+11 1.7623e+12 4866.3
## - Year.built 1 2.4463e+11 1.8764e+12 4879.6
## - Main.house.sqft 1 4.0989e+11 2.0416e+12 4897.5
##
## Step: AIC=4850.33
## as.numeric(Listing.price) ~ Year.built + Type + Bedrooms + X..of.garage.spaces +
## X..of.carports + Lot.acres + Lot.sqft + Taxes + Main.house.sqft +
## X..of.Fireplaces + Pool
##
## Df Sum of Sq RSS AIC
## - Taxes 1 8.5227e+09 1.6429e+12 4849.4
## <none> 1.6344e+12 4850.3
## - Lot.acres 1 2.9379e+10 1.6638e+12 4852.1
## - Lot.sqft 1 3.1751e+10 1.6662e+12 4852.4
## - X..of.garage.spaces 1 3.2796e+10 1.6672e+12 4852.5
## - Bedrooms 1 4.2678e+10 1.6771e+12 4853.8
## - X..of.carports 1 4.7548e+10 1.6819e+12 4854.4
## - Pool 1 1.0299e+11 1.7374e+12 4861.3
## - Type 1 1.1224e+11 1.7466e+12 4862.4
## - X..of.Fireplaces 1 1.2959e+11 1.7640e+12 4864.5
## - Year.built 1 2.4878e+11 1.8832e+12 4878.4
## - Main.house.sqft 1 7.3517e+11 2.3696e+12 4927.1
##
## Step: AIC=4849.43
## as.numeric(Listing.price) ~ Year.built + Type + Bedrooms + X..of.garage.spaces +
## X..of.carports + Lot.acres + Lot.sqft + Main.house.sqft +
## X..of.Fireplaces + Pool
##
## Df Sum of Sq RSS AIC
## <none> 1.6429e+12 4849.4
## - Lot.acres 1 2.7623e+10 1.6705e+12 4851.0
## - X..of.garage.spaces 1 2.9674e+10 1.6726e+12 4851.2
## - Lot.sqft 1 2.9938e+10 1.6729e+12 4851.3
## - Bedrooms 1 4.2224e+10 1.6851e+12 4852.8
## - X..of.carports 1 4.5178e+10 1.6881e+12 4853.2
## - Pool 1 1.1784e+11 1.7608e+12 4862.1
## - Type 1 1.3220e+11 1.7751e+12 4863.8
## - X..of.Fireplaces 1 1.4572e+11 1.7886e+12 4865.4
## - Year.built 1 2.4102e+11 1.8839e+12 4876.5
## - Main.house.sqft 1 7.4314e+11 2.3861e+12 4926.5
#CREATE MODEL GIVEN BY STEP FUNCTION
step_model_three <- lm(as.numeric(Listing.price)~ X..of.Fireplaces+
X..of.carports+
X..of.garage.spaces+
Status +
Pool +
Taxes +
Total.baths+
Type +
Bedrooms +
Year.built +
Lot.acres +
Main.house.sqft,data = train_four)
step_model_summary_three <- summary(step_model_three)
test_four$predicted_p <- predict(step_model_three,test_four)
step_model_Listing_price_mse <- mean((test_four$Listing.price - test_four$predicted_p)^2)
info_frame_two[3,1] <- step_model_Listing_price_mse
info_frame_two[3,2] <- sqrt(step_model_Listing_price_mse)
info_frame_two[3,3] <- step_model_summary_three$r.squared*100
info_frame_two[3,4] <- data_4$Zip[1]
info_frame_two[3,5] <- nrow(data_4)
info_frame_two[3,6] <- "AIC"
#NAME COLUMNS AND PLACE MODEL DATA INTO FRAME
names(info_frame_two) <- names
head(info_frame_two)
## MSE RMSE R^2 ZIP ROWS MODEL TYPE
## 1 254208859 15943.93 84.40917 85757 156 AIC
## 2 3695546028 60791.00 81.32775 85629 187 AIC
## 3 26517743603 162842.70 85.29671 85641 212 AIC
info_frame
## MSE RMSE R^2 ZIP ROWS MODEL TYPE
## 1 3249159471 57001.40 89.42592 85614 176 Non AIC
## 2 279512849 16718.64 85.26158 85757 156 Non AIC
## 3 2630064805 51284.16 86.07035 85629 187 Non AIC
## 4 5942036148 77084.60 78.61366 85641 212 Non AIC
## 5 4676032959 68381.52 81.45764 85653 252 Non AIC
## 6 203242275587 450824.00 85.18024 85658 132 Non AIC
## 7 1691652065 41129.70 93.95959 85710 122 Non AIC
## 8 16307548948 127701.01 88.02051 85743 125 Non AIC
## 9 1523734614 39035.04 92.75811 85747 145 Non AIC
## 10 91147167557 301905.89 86.66333 85755 132 Non AIC
#Author: Haley Bejarano
library(knitr)
library(tidyverse)
library(ggplot2)
library(ggthemes)
#This imports the data
RE_data <- read_csv("Housing_Data_Clean.csv")
# 1 - Scatter Plots for the listing price in the zip code it resides in
price_zip_type_plot <- ggplot(RE_data, aes(x = Zip, y = Listing.price/1000,
fill = Housing.type)) +
geom_bar(stat = "summary", fun = "mean") +
ggtitle("Average of House Listing Price in each Zipcode") +
xlab("Zip Code") + ylab("Current Lising Price (in thousands)") +
xlim(85600,85760) + facet_wrap(~Housing.type, scales = "free")
price_zip_type_plot

#reorder?
# 2 - Scatter Plots for the listing price for the year it was built in
price_year_type_plot <- ggplot(RE_data, aes(x = Year.built, y = Listing.price/1000,
color = Housing.type)) +
geom_point(alpha = .5) +
ggtitle("Average of House Listing Price in each Year when House was Built") +
xlab("Year Housing was Built") + ylab("Current Lising Price (in thousands)") +
xlim(1900,2024)
price_year_type_plot

# Turn into scatter lot
# 3 - Scatter Plots for the tax amounts in each school district
price_school_type_plot <- ggplot(RE_data, aes(x = School.district, y = Taxes,
fill = Housing.type)) +
geom_bar(stat = "summary", fun = "mean") +
ggtitle("Average of Housing Taxes in each School District") +
xlab("School District") + ylab("Current Lising Price (in thousands)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_fill_colorblind()
price_school_type_plot

#facet_wrap & reorder
# 4 - These bar graphs see if pools are present pools by listing price by
# housing type
pool_price_type_plot <- RE_data %>%
ggplot(aes(x = Housing.type, y = Listing.price, fill = Pool)) +
geom_bar(stat = "summary", fun = "mean", position = "dodge") +
ggtitle("Average of Lisitng Price with Pools for Different Housing Types") +
xlab("Pool on Property") + ylab("Lising Price (in thousands)")
pool_price_type_plot

# 5 - Bar Plots for the listing price in the housing style it resides in
style_price_style_plot <- ggplot(RE_data, aes(x = House.style, y = Listing.price/1000,
fill = Housing.type)) +
geom_bar(stat = "summary", fun = "mean") +
ggtitle("Average of House Listing Price by Style of Home Design") +
xlab("Zip Code") + ylab("Current Lising Price (in thousands)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
facet_wrap(~Housing.type) + scale_fill_colorblind()
style_price_style_plot

# 6 - Scatter plot for the house square footage for the year it was built in
sqft_year_type_plot <- ggplot(RE_data, aes(x = Year.built, y = Main.house.sqft,
color = Housing.type)) +
geom_point(alpha = .5) +
ggtitle("Average of House Footage in each Year when House was Built") +
xlab("Year Built") + ylab("Main House Square Footage") +
xlim(1870,2024)
sqft_year_type_plot

# 7 - Bar Plots for the listing price by the number of bedrooms
price_bedroom_type_plot <- ggplot(RE_data, aes(x = Bedrooms, y = Listing.price/1000,
fill = Housing.type)) +
geom_bar(stat = "summary", fun = "mean") +
ggtitle("Average of House Listing Price by Number of Bedrooms") +
xlab("Number of Bedrooms") + ylab("Current Lising Price (in thousands)") +
scale_fill_colorblind()
price_bedroom_type_plot

# 8 - Bar Plots for the listing price by the number of total baths
price_bath_type_plot <- ggplot(RE_data, aes(x = Total.baths, y = Listing.price/1000,
fill = Housing.type)) +
geom_bar(stat = "summary", fun = "mean") +
ggtitle("Average of House Listing Price by Total Number of Baths") +
xlab("Total Number of Baths") + ylab("Current Lising Price (in thousands)") +
scale_fill_colorblind()
price_bath_type_plot

# 9 - Bar Plots for the housing taxes in the housing style it
# resides in
style_year_type_plot <- ggplot(RE_data, aes(x = reorder(House.style, Taxes),
y = Taxes, fill = Housing.type)) +
geom_bar(stat = "summary", fun = "mean") +
ggtitle("Average of House Taxes by Style of Home Design") +
xlab("Housing Style") + ylab("House taxes") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
facet_wrap(~Housing.type) + scale_fill_colorblind()
style_year_type_plot

# 10 - Bar Plots for the listing price of the house in the housing style it
# resides in
zip_price_style_plot <- ggplot(RE_data, aes(x = Zip, y = Listing.price/1000,
fill = House.style)) +
geom_bar(stat = "summary", fun = "mean") +
ggtitle("Average of House Listing Price by Style of Home Design") +
xlab("Zip Code") + ylab("Current Listing Price (in thousands)") +
xlim(85600,85760) + facet_wrap(~House.style, scales = "free")
zip_price_style_plot
