# The problem we are working on is predicting house prices in King County.
# Our objective is to build a model that accurately predicts house prices for
# King County homes for Jacob Kawalski, and to help him better understand the
# local real estate market.
# The data is a random sample of King County homes and their relevant sales
# data. In total this includes the ID, year, month, day, day of week, price,
# number of bedrooms & bathrooms, the square footage of the home & lot, the
# number of floors of the house, whether or not the house has a waterfront
# view, the number of times the house has been viewed, the condition of the
# house, the grade of the house based on the King County rating scale, the
# square footage of the house minus the basement & of the basement itself, the
# year the house was built, the year the house was renovated, the zip code of
# the house, the latitude and longitude of the house, and the living room area
# and lot size in 2015 to gauge renovations.
# We focused on the year the house was sold, the price as our target variable,
# the number of bedrooms & bathrooms, the square footage of the house & lot,
# whether or not the house is a waterfront property, the grade from the King
# County rating scale, the year the house was built & renovated (if relevant),
# the zip code, and the square footage of the living room and lot in 2015. We
# felt these variables would best predict the price of a home and correlated
# well with price, without being so numerous as to raise concerns about
# overfitting the model to the data.
# With a low p-value and an RMSE that is lower in the validation set than it is
# in the training set, we believe that this regression model is providing us
# with fairly accurate predictions of the house values given our input
# variables.
# The predictions range from just under $200,000 to over $325,000, which from
# our understanding is fairly accurate for the Seattle real estate market circa
# 2014-15.
# Load packages and read in the data
library(rpart)
library(rpart.plot)
library(forecast)
library(tidyr)
library(ROSE)
library(corrgram)
library(ggplot2)
library(ggpubr)
library(forecast)
# Read the raw King County sales sample; the first row of the CSV supplies the
# column names.
house <- read.csv("house_8.csv", header = TRUE)
names(house)
# Examine the data type of every column.
str(house)
# Drop every row that contains at least one missing value.
house_clean <- na.omit(house)
# Trim the data down to the modelling columns by dropping columns by position.
# The subset is taken from `house_clean` (not the raw `house`) so that the NA
# filtering above actually reaches the modelling data; previously the cleaned
# frame was computed and then never used.
# NOTE(review): if the raw file contains NAs, figures quoted in comments
# elsewhere in this script were produced without this filter -- re-check them.
house_final <- house_clean[, -c(2, 4:6, 12, 14:15, 17:18, 22:23)]
# View the final data types.
str(house_final)
# Fix the RNG seed so the train/validation partition is reproducible.
set.seed(666)
# Randomly assign 60% of the rows (by index) to training; every remaining row
# goes to validation.
# seq_len() replaces 1:nrow() so a zero-row data frame yields empty partitions
# instead of the bogus indices c(1, 0). floor() makes explicit the truncation
# that sample() already applied to the fractional size, so the partition for
# non-empty data is unchanged.
row_ids <- seq_len(nrow(house_final))
train_index <- sample(row_ids, floor(0.6 * nrow(house_final)))
valid_index <- setdiff(row_ids, train_index)
train_df <- house_final[train_index, ]
valid_df <- house_final[valid_index, ]
# Report the size of each partition.
nrow(train_df)
nrow(valid_df)
# Plot a correlogram of the training columns.
corrgram(train_df)
# Fit a multiple linear regression of price on every other column of the
# training partition, then print the fit summary.
# NOTE(review): `price ~ .` also uses `X` as a predictor; it looks like a row
# identifier -- confirm it belongs in the model.
price_model <- lm(formula = price ~ ., data = train_df)
summary(price_model)
# We chose a multiple linear regression to predict King County home prices
# because regression analysis can show the relative impact of several
# variables on the target variable. The model is significant because its
# p-value is below .01.

# Validation partition: predictions, accuracy measures, and the spread of the
# actual prices for comparison.
price_model_pred <- predict(price_model, newdata = valid_df)
accuracy(price_model_pred, valid_df[["price"]])
sd(valid_df[["price"]])

# Training partition: the same three steps.
price_model_pred2 <- predict(price_model, newdata = train_df)
accuracy(price_model_pred2, train_df[["price"]])
sd(train_df[["price"]])
# RMSE is lower in the validation data than in the training data, which is a
# good indicator. Overall our model has suitable accuracy, is statistically
# significant, and can be validated by our RMSE values as well.
# Three hypothetical listings to price with the fitted model. Each one is a
# one-row data frame supplying the columns passed to predict() below.
# The first record is named `house1` rather than `house` so that it no longer
# overwrites the raw dataset read from "house_8.csv"; clobbering it meant the
# earlier cleaning steps could not be re-run after this point.

# New record 1
house1 <- data.frame(
  X = 1,
  Year = 2014,
  bedrooms = 3,
  bathrooms = 1.75,
  sqft_living = 1060,
  sqft_lot = 38644,
  waterfront = 0,
  grade = 7,
  yr_built = 1983,
  yr_renovated = 0,
  zipcode = 98077,
  sqft_living15 = 1310,
  sqft_lot15 = 11416
)

# New record 2
house2 <- data.frame(
  X = 2,
  Year = 2014,
  bedrooms = 2,
  bathrooms = 1,
  sqft_living = 1220,
  sqft_lot = 5040,
  waterfront = 0,
  grade = 7,
  yr_built = 1961,
  yr_renovated = 0,
  zipcode = 98117,
  sqft_living15 = 1420,
  sqft_lot15 = 5040
)

# New record 3
house3 <- data.frame(
  X = 3,
  Year = 2014,
  bedrooms = 3,
  bathrooms = 1,
  sqft_living = 1240,
  sqft_lot = 7300,
  waterfront = 0,
  grade = 7,
  yr_built = 1968,
  yr_renovated = 0,
  zipcode = 98033,
  sqft_living15 = 1240,
  sqft_lot15 = 8260
)

# Predicting price of house 1
house_pred <- predict(price_model, house1)
house_pred
# The price prediction recorded for house 1 is $195,537.8

# Predicting price of house 2
house2_pred <- predict(price_model, house2)
house2_pred
# The price prediction recorded for house 2 is $325,152.3

# Predicting price of house 3
house3_pred <- predict(price_model, house3)
house3_pred
# The price prediction recorded for house 3 is $251,454.8
LS0tCnRpdGxlOiAiUHJvamVjdCBQcm9ibGVtIDMiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCmBgYHtyfQojVGhlIHByb2JsZW0gd2UgYXJlIHdvcmtpbmcgb24gaXMgd2UgYXJlIGxvb2tpbmcgdG8gcHJlZGljdCBob3VzZSBwcmljZSBkYXRhIGluIEtpbmcgQ291bnR5CmBgYAoKYGBge3J9CiNPdXIgb2JqZWN0aXZlIGlzIHRvIGJ1aWxkIGEgbW9kZWwgdG8gYWNjdXJhdGVseSBwcmVkaWN0IGhvdXNlIHByaWNlcyBmb3IgS2luZyBDb3VudHJ5IGhvbWVzIGZvciBKYWNvYiBLYXdhbHNraSBhbmQgYmV0dGVyIGhlbHAgaGltIHVuZGVyc3RhbmQgdGhlIGxvY2FsICNyZWFsIGVzdGF0ZSBtYXJrZXQKYGBgCgpgYGB7cn0KI1RoZSBkYXRhIGlzIGEgcmFuZG9tIHNhbXBsZSBvZiBLaW5nIENvdW50eSBob21lcyBhbmQgdGhlaXIgcmVsZXZhbnQgc2FsZXMgZGF0YS4gVGhpcyBpbmNsdWRlcyBpbiB0b3RhbCB0aGUgSUQsIHllYXIsIG1vbnRoLCBkYXksIGRheSBvZiB3ZWVrLCBwcmljZSwgI251bWJlciBvZiBiZWRyb29tcyAmIGJhdGhyb29tcywgdGhlIHNxdWFyZSBmb290YWdlIG9mIHRoZSBob21lICYgbG90LCB0aGUgbnVtYmVyIG9mIGZsb29ycyBvZiB0aGUgaG91c2UsIHdoZXRoZXIgb3Igbm90IHRoZSBob3VzZSBoYXMgYSB3YXRlcmZyb250ICAjdmlldywgdGhlIG51bWJlciBvZiB0aW1lcyB0aGUgaG91c2UgaGFzIGJlZW4gdmlld2VkLCB0aGUgY29uZGl0aW9uIG9mIHRoZSBob3VzZSwgdGhlIGdyYWRlIG9mIHRoZSBob3VzZSBiYXNlZCBvbiB0aGUgS2luZyBDb3VudHkgcmF0aW5nIHNjYWxlLCB0aGUgI3NxdWFyZSBmb290YWdlIG9mIHRoZSBob3VzZSBtaW51cyB0aGUgYmFzZW1lbnQgJiB0aGUgYmFzZW1lbnQgaXRzZWxmLCB0aGUgeWVhciB0aGUgaG91c2Ugd2FzIGJ1aWx0LCB0aGUgeWVhciB0aGUgaG91c2Ugd2FzIHJlbm92YXRlZCwgdGhlIHppcCBjb2RlIG9mICN0aGUgaG91c2UsIHRoZSBsYXRpdHVkZSBhbmQgbG9uZ2l0dWRlIG9mIHRoZSBob3VzZSwgYW5kIHRoZSBsaXZpbmcgcm9vbSBhcmVhIGFuZCBsb3Qgc2l6ZSBpbiAyMDE1IHRvIGdhdWdlIHJlbm92YXRpb25zCmBgYAoKYGBge3J9CiNXZSBmb2N1c2VkIG9uIHRoZSB5ZWFyIHRoZSBob3VzZSB3YXMgc29sZCwgdGhlIHByaWNlIGFzIG91ciB0YXJnZXQgdmFyaWFibGUsIHRoZSBudW1iZXIgb2YgYmVkcm9vbXMgJiBiYXRocm9vbXMsIHRoZSBzcXVhcmUgZm9vdGFnZSBvZiB0aGUgaG91c2UgJiBsb3QsICN3aGV0aGVyIG9yIG5vdCB0aGUgaG91c2UgaXMgYSB3YXRlcmZyb250IHByb3BlcnR5LCB0aGUgZ3JhZGUgZnJvbSB0aGUgS2luZyBDb3VudHkgcmF0aW5nIHNjYWxlLCB0aGUgeWVhciB0aGUgaG91c2Ugd2FzIGJ1aWx0ICYgcmVub3ZhdGVkIChpZiByZWxldmFudCksICN0aGUgemlwIGNvZGUsIGFuZCB0aGUgc3F1YXJlIGZvb3RhZ2Ugb2YgdGhlIGxpdmluZyByb29tIGFuZCBsb3QgaW4gMjAxNSBiZWNhdXNlIHdlIGZlbHQgdGhlc2UgdmFyaWFibGVzIHdvdWxk
IGJlc3QgcHJlZGljdCB0aGUgcHJpY2Ugb2YgYSBob21lIGFuZCAjY29ycmVsYXRlZCB3ZWxsIHdpdGggcHJpY2Ugd2l0aG91dCBiZWluZyB0b28gbWFueSB0byBjYXVzZSBjb25jZXJuIG9mIG92ZXIgZml0dGluZyB0aGUgZGF0YSB0byB0aGUgbW9kZWwKYGBgCgpgYGB7cn0KI1dpdGggYSBsb3cgcCB2YWx1ZSBhbmQgYW4gUlNNRSB3aGljaCBpcyBsb3dlciBpbiB0aGUgdmFsaWRhdGlvbiBzZXQgdGhlbiBpdCBpcyBpbiB0aGUgdHJhaW5pbmcgc2V0LCB3ZSBiZWxpZXZlIHRoYXQgdGhpcyByZWdyZXNzaW9uIG1vZGVsIGlzIHByb3ZpZGluZyB1cyB3aXRoIGZhaXJseSBhY2N1cmF0ZSBwcmVkaWN0aW9ucyBvZiB0aGUgaG91c2UgdmFsdWVzIGdpdmVuIG91ciBpbnB1dCB2YXJpYWJsZXMuCiNUaGUgcHJlZGljdGlvbnMgcmFuZ2UgZnJvbSBqdXN0IHVuZGVyICQyMDAsMDAwIHRvIG92ZXIgJDMyNSwwMDAgd2hpY2ggZnJvbSBvdXIgdW5kZXJzdGFuZGluZyBpcyBmYWlybHkgYWNjdXJhdGUgZm9yIHRoZSBTZWF0dGxlIHJlYWwgZXN0YXRlIG1hcmtldCBjaXJjYSAyMDE0LTE1LgpgYGAKCmBgYHtyfQojbG9hZCBpbiBkYXRhCmxpYnJhcnkocnBhcnQpCmxpYnJhcnkocnBhcnQucGxvdCkKbGlicmFyeShmb3JlY2FzdCkKbGlicmFyeSh0aWR5cikKbGlicmFyeShST1NFKQpsaWJyYXJ5KGNvcnJncmFtKQpsaWJyYXJ5KGdncGxvdDIpCmxpYnJhcnkoZ2dwdWJyKQpsaWJyYXJ5KGZvcmVjYXN0KQpob3VzZSA8LSByZWFkLmNzdigiaG91c2VfOC5jc3YiLCBoZWFkZXIgPSBUUlVFKQpuYW1lcyhob3VzZSkKYGBgCmBgYHtyfQojZXhhbWluZSBkYXRhIHR5cGVzCnN0cihob3VzZSkKYGBgCgpgYGB7cn0KI3JlbW92ZSBlbXB0eSB2YWx1ZXMKaG91c2VfY2xlYW4gPC0gbmEub21pdChob3VzZSkKYGBgCgpgYGB7cn0KI3RyaW0gZG93biBkYXRhIHRvIHJlbGV2YW50IHZhbHVlcwpob3VzZV9maW5hbCA8LSBob3VzZVssIC1jKDIsNDo2LCAxMiwgMTQ6MTUsIDE3OjE4LCAyMjoyMyldCmBgYAoKYGBge3J9CiN2aWV3IGZpbmFsIGRhdGEgdHlwZXMKc3RyKGhvdXNlX2ZpbmFsKQpgYGAKYGBge3J9CnNldC5zZWVkKDY2NikKI1JhbmRvbWx5IHNhbXBsZSB0aGUgcm93cyB2aWEgdGhlaXIgaW5kaWNlcwp0cmFpbl9pbmRleCA8LXNhbXBsZSgxOm5yb3coaG91c2VfZmluYWwpLCAwLjYqbnJvdyhob3VzZV9maW5hbCkpCnZhbGlkX2luZGV4IDwtc2V0ZGlmZigxOm5yb3coaG91c2VfZmluYWwpLCB0cmFpbl9pbmRleCkKdHJhaW5fZGYgPC0gaG91c2VfZmluYWxbdHJhaW5faW5kZXgsIF0KdmFsaWRfZGYgPC0gaG91c2VfZmluYWxbdmFsaWRfaW5kZXgsIF0KbnJvdyh0cmFpbl9kZikKbnJvdyh2YWxpZF9kZikKY29ycmdyYW0odHJhaW5fZGYpCmBgYAoKYGBge3J9CnByaWNlX21vZGVsIDwtIGxtKHByaWNlIH4gLiwgZGF0YSA9IHRyYWluX2RmKQpzdW1tYXJ5KHByaWNlX21vZGVsKQojIFdlIGNob3NlIHRvIGltcGxlbWVudCBhIG11bHRpcGxlIGxpbmVhciByZWdyZXNz
aW9uIGZvciBvdXIgbW9kZWwgdG8gcHJlZGljdCBLaW5nIENvdW50eSBob21lIHByaWNlcy4gV2UgY2hvc2UgdGhpcyBiZWNhdXNlIHJlZ3Jlc3Npb24gYW5hbHlzaXMgaGFzIGFkdmFudGFnZXMgaW4gYmVpbmcgYWJsZSB0byBkZXRlcm1pbmUgcmVsYXRpdmUgaW1wYWN0cyBvZiBtdWx0aXBsZSB2YXJpYWJsZXMgb24gdGhlIHRhcmdldCB2YXJpYWJsZS4gVGhlIG1vZGVsIGlzIHNpZ25pZmljYW50IGJlY2F1c2UgdGhlIHAgdmFsdWU6IHAgPCAuMDEKYGBgCgpgYGB7cn0KcHJpY2VfbW9kZWxfcHJlZCA8LSBwcmVkaWN0KHByaWNlX21vZGVsLCB2YWxpZF9kZikKYWNjdXJhY3kocHJpY2VfbW9kZWxfcHJlZCwgdmFsaWRfZGYkcHJpY2UpCnNkKHZhbGlkX2RmJHByaWNlKQpgYGAKCmBgYHtyfQpwcmljZV9tb2RlbF9wcmVkMiA8LSBwcmVkaWN0KHByaWNlX21vZGVsLCB0cmFpbl9kZikKYWNjdXJhY3kocHJpY2VfbW9kZWxfcHJlZDIsIHRyYWluX2RmJHByaWNlKQpzZCh0cmFpbl9kZiRwcmljZSkKIyBSU01FIGlzIGxvd2VyIGluIHRoZSB2YWxpZGF0aW9uIGRhdGEgdGhlbiB0aGUgdHJhaW5pZyBkYXRhLCB3aGljaCBpcyBhIGdvb2QgaW5kaWNhdG9yLiBPdmVyYWxsIG91ciBtb2RlbCBoYXMgc3VpdGFibGUgYWNjdXJhY3ksIGlzIHN0YXRpc3RpY2FsbHkgc2lnbmlmaWNhbnQsIGFuZCBjYW4gYmUgdmFsaWRhdGVkIGJ5IG91ciBSTVNFIHZhbHVlcyBhcyB3ZWxsLgpgYGAKCmBgYHtyfQojIENyZWF0ZSBhIG5ldyByZWNvcmQKaG91c2UgPC1kYXRhLmZyYW1lKFggPTEsCiAgICAgICAgICAgICAgICAgICAgIFllYXIgPTIwMTQsCiAgICAgICAgICAgICAgICAgICAgIGJlZHJvb21zID0zLAogICAgICAgICAgICAgICAgICAgICBiYXRocm9vbXMgPTEuNzUsCiAgICAgICAgICAgICAgICAgICAgIHNxZnRfbGl2aW5nID0xMDYwLAogICAgICAgICAgICAgICAgICAgICBzcWZ0X2xvdCA9Mzg2NDQsCiAgICAgICAgICAgICAgICAgICAgIHdhdGVyZnJvbnQgPTAsCiAgICAgICAgICAgICAgICAgICAgIGdyYWRlID03LAogICAgICAgICAgICAgICAgICAgICB5cl9idWlsdCA9MTk4MywKICAgICAgICAgICAgICAgICAgICAgeXJfcmVub3ZhdGVkID0wLAogICAgICAgICAgICAgICAgICAgICB6aXBjb2RlID05ODA3NywKICAgICAgICAgICAgICAgICAgICAgc3FmdF9saXZpbmcxNSA9MTMxMCwKICAgICAgICAgICAgICAgICAgICAgc3FmdF9sb3QxNSA9MTE0MTYpCmBgYAoKYGBge3J9CiMgQ3JlYXRlIGEgbmV3IHJlY29yZApob3VzZTIgPC1kYXRhLmZyYW1lKFggPTIsCiAgICAgICAgICAgICAgICAgICAgIFllYXIgPTIwMTQsCiAgICAgICAgICAgICAgICAgICAgIGJlZHJvb21zID0yLAogICAgICAgICAgICAgICAgICAgICBiYXRocm9vbXMgPTEsCiAgICAgICAgICAgICAgICAgICAgIHNxZnRfbGl2aW5nID0xMjIwLAogICAgICAgICAgICAgICAgICAgICBzcWZ0X2xvdCA9NTA0MCwKICAgICAgICAgICAgICAgICAgICAgd2F0ZXJmcm9udCA9MCwKICAgICAg
ICAgICAgICAgICAgICAgZ3JhZGUgPTcsCiAgICAgICAgICAgICAgICAgICAgIHlyX2J1aWx0ID0xOTYxLAogICAgICAgICAgICAgICAgICAgICB5cl9yZW5vdmF0ZWQgPTAsCiAgICAgICAgICAgICAgICAgICAgIHppcGNvZGUgPTk4MTE3LAogICAgICAgICAgICAgICAgICAgICBzcWZ0X2xpdmluZzE1ID0xNDIwLAogICAgICAgICAgICAgICAgICAgICBzcWZ0X2xvdDE1ID01MDQwKQpgYGAKCmBgYHtyfQojIENyZWF0ZSBhIG5ldyByZWNvcmQKaG91c2UzIDwtZGF0YS5mcmFtZShYID0zLAogICAgICAgICAgICAgICAgICAgICBZZWFyID0yMDE0LAogICAgICAgICAgICAgICAgICAgICBiZWRyb29tcyA9MywKICAgICAgICAgICAgICAgICAgICAgYmF0aHJvb21zID0xLAogICAgICAgICAgICAgICAgICAgICBzcWZ0X2xpdmluZyA9MTI0MCwKICAgICAgICAgICAgICAgICAgICAgc3FmdF9sb3QgPTczMDAsCiAgICAgICAgICAgICAgICAgICAgIHdhdGVyZnJvbnQgPTAsCiAgICAgICAgICAgICAgICAgICAgIGdyYWRlID03LAogICAgICAgICAgICAgICAgICAgICB5cl9idWlsdCA9MTk2OCwKICAgICAgICAgICAgICAgICAgICAgeXJfcmVub3ZhdGVkID0wLAogICAgICAgICAgICAgICAgICAgICB6aXBjb2RlID05ODAzMywKICAgICAgICAgICAgICAgICAgICAgc3FmdF9saXZpbmcxNSA9MTI0MCwKICAgICAgICAgICAgICAgICAgICAgc3FmdF9sb3QxNSA9ODI2MCkKYGBgCgpgYGB7cn0KIyBQcmVkaWN0aW5nIHByaWNlIG9mIGhvdXNlIDEKaG91c2VfcHJlZCA8LSBwcmVkaWN0KHByaWNlX21vZGVsLCBob3VzZSkKaG91c2VfcHJlZAojdGhlIHByaWNlIHByZWRpY3Rpb24gZm9yIGhvdXNlIDEgaXMgJDE5NSw1MzcuOApgYGAKCmBgYHtyfQojIFByZWRpY3RpbmcgcHJpY2Ugb2YgaG91c2UgMgpob3VzZTJfcHJlZCA8LSBwcmVkaWN0KHByaWNlX21vZGVsLCBob3VzZTIpCmhvdXNlMl9wcmVkCiN0aGUgcHJpY2UgcHJlZGljdGlvbiBmb3IgaG91c2UgMiBpcyAkMzI1LDE1Mi4zCmBgYAoKYGBge3J9CiMgUHJlZGljdGluZyBwcmljZSBvZiBob3VzZSAzCmhvdXNlM19wcmVkIDwtIHByZWRpY3QocHJpY2VfbW9kZWwsIGhvdXNlMykKaG91c2UzX3ByZWQKI3RoZSBwcmljZSBwcmVkaWN0aW9uIGZvciBob3VzZSAzIGlzICQyNTEsNDU0LjgKYGBgCg==