# R Notebook

#getwd()

#read.csv("C:/Users/Nicolas Bertuleit/Documents/housing.csv")

#housing_data <-read.csv("C:/Users/Nicolas Bertuleit/Documents/housing.csv")
#We assigned a name to our dataset.

#housing_data <- subset(housing_data, select = -c(url, region_url,image_url,description))
#we deleted some columns that we didn't need for our analysis.

#write.csv(housing_data, "Housing_data2.csv", row.names = FALSE)
#re-write the dataset without the deleted columns

#housing_data2 <-read.csv("C:/Users/Nicolas Bertuleit/Documents/housing_data2.csv")

#View(housing_data2)

#str(housing_data2)
#structure of the data set(we ran it a few times, already modified)

#summary(housing_data2)

#housing_data2$dogs_allowed <- as.logical(housing_data2$dogs_allowed)
#We converted this variable from boolean to logical

#housing_data2$cats_allowed <- as.logical(housing_data2$cats_allowed)
#We converted this variable from boolean to logical

#housing_data2$smoking_allowed <- as.logical(housing_data2$smoking_allowed)
#We converted this variable from boolean to logical

#housing_data2$wheelchair_access <- as.logical(housing_data2$wheelchair_access)
#We converted this variable from boolean to logical

#housing_data2$comes_furnished <- as.logical(housing_data2$comes_furnished)
#We converted this variable from boolean to logical

#housing_data2$electric_vehicle_charge <- as.logical(housing_data2$electric_vehicle_charge)

#summary(housing_data2)

#summary(housing_data2$price)
#to understand the values of price. The dataset included values extremely big compared to the mean and median of the rest of the dataset.

#range(housing_data2$price)

#quantile(housing_data2$price)

#housing_data2 <- subset(housing_data2, price <= 20000)
#To clean the data set, we eliminated the rows that included prices above $20,000

#range(housing_data2$price)

#housing_data2 <- subset(housing_data2, nchar(state) == 2)
##To clean the data set, we kept only the rows whose state code is exactly 2 characters long.

#summary(housing_data2)

#hist(housing_data2$price)
#to visualize the distribution of prices in our data set.

#boxplot(housing_data2$price)

#housing_data <- na.omit(housing_data)

#Q1 <- quantile(housing_data$price_per_sqft, 0.25)
#Q3 <- quantile(housing_data$price_per_sqft, 0.75)
#IQR <- Q3 - Q1
#lower_bound <- Q1 - 1.5 * IQR
#upper_bound <- Q3 + 1.5 * IQR
#housing_data <- housing_data[housing_data$price_per_sqft >= lower_bound & housing_data$price_per_sqft <= upper_bound, ]

#price_per_state <- aggregate(housing_data2$price, by = list(housing_data2$state), FUN = mean)
#Creating a table of the average price per state for further analysis.

#colnames(price_per_state) <- c("state", "average_Price")

#price_per_state <- price_per_state[order(price_per_state
#                                        $average_Price), ]

#barplot(price_per_state$average_Price, names.arg = price_per_state$state, 
#        xlab = "state", ylab = "average Price", main = "average price by state",
#        col = "blue", las = 2) 

#Based on the analysis, we can conclude that the most expensive state to rent a property is Hawaii.

#housing_data2$price_per_sqft <- ifelse(housing_data2$sqfeet == 0, NA, housing_data2$price / housing_data2$sqfeet)
#for further analysis, we create a variable of price per square foot (NA when sqfeet is 0).

#price_per_sqft2 <- aggregate(housing_data2$price_per_sqft, by = list(housing_data2$state), FUN = mean, na.rm = TRUE)

#colnames(price_per_sqft2) <- c("state", "average_price_per_sqft")

#price_per_sqft2 <- price_per_sqft2[order(price_per_sqft2$average_price_per_sqft), ]

#barplot(price_per_sqft2$average_price_per_sqft, names.arg = price_per_sqft2$state, 
#        xlab = "State", ylab = "Average Price per sqft", main = "Average Price per sqft by State",
#        col = "green", las = 2)
#based on the price per sqft, the most expensive state for rent is OH.

#sum(is.na(housing_data2$sqfeet))
#there are 3882 missing values in the sqfeet column of our dataset.

#plot(housing_data2$price, housing_data2$sqfeet)
#cor(housing_data2$price,housing_data2$sqfeet)

#table(housing_data2$state)

#cor(housing_data2[c("beds", "price", "baths", "sqfeet")])

#cor(housing_data2[c("beds", "price", "baths", "sqfeet","dogs_allowed")])

#install.packages("ggplot2")
#library(ggplot2)

#our.data <- data.frame(sqfeet = housing_data2$sqfeet, price_per_sqft = housing_data2$price_per_sqft)
#gg <- ggplot(our.data, aes(sqfeet, price_per_sqft))
#gg <- gg + geom_point()
#gg <- gg + geom_smooth(method = "lm", se=F, color="red")
#gg <- gg + ylim(0, 6)
#gg <- gg + xlim(500, 1500)
#gg <- gg + theme_bw()
#print(gg)
#the bigger the apartment, the lower the price per square foot.

#housing_data_filtered$pets_allowed <- ifelse(housing_data_filtered$cats_allowed == 1 | housing_data_filtered$dogs_allowed == 1, 1, 0)

#we are going to clean the property type column (drop types with fewer than 100 rows)

#property_type <- table(housing_data$type)
#to_delete <- names(property_type[property_type < 100])
#housing_data_filtered <- subset(housing_data2, !(type %in% to_delete))
#nrow(housing_data_filtered)

#View(housing_data_filtered)

#install.packages("caret")
#library(caret)

#regiones <- c("ak", "az", "ca", "co", "hi", "id", "mt", "nv", "nm", "or", "ut", "wa", "wy")       # West
#regiones <- c(regiones, "il", "in", "ia", "ks", "mi", "mn", "mo", "ne", "nd", "oh", "sd", "wi")   # Midwest
#regiones <- c(regiones, "al", "ar", "de", "fl", "ga", "ky", "la", "md", "ms", "nc", "ok", "sc", "tn", "tx", "va", "wv")   # South
#regiones <- c(regiones, "ct", "me", "ma", "nh", "nj", "ny", "pa", "ri", "vt","dc")   # North

#asignar_region <- function(estado) {
#  for (i in 1:length(regiones)) {
#    if (tolower(estado) %in% regiones[i]) {
#      return(ifelse(i <= 13, "West", ifelse(i <= 25, "Midwest", ifelse(i <= 41, "South", "North"))))
 #   }
#  }
# return("Desconocido")  # Just in case
#}

#housing_data_filtered$region <- sapply(housing_data_filtered$state, asignar_region)

#table(housing_data_filtered$region)

#View(housing_data_filtered)

#set.seed(123)  
#train_index <- createDataPartition(housing_data_filtered$price, p = 0.8, list = FALSE)
#train_data <- housing_data_filtered[train_index, ]
#test_data <- housing_data_filtered[-train_index, ]

#model <- lm(price ~ sqfeet + beds + baths + dogs_allowed + cats_allowed+ smoking_allowed + wheelchair_access + electric_vehicle_charge + comes_furnished + region , data = train_data)

#predictions <- predict(model, newdata = test_data)

#RMSE <- sqrt(mean((test_data$price - predictions)^2,na.rm = TRUE))
#print(paste("Root Mean Squared Error (RMSE):", RMSE))

#summary(model)

#set.seed(123)  
#train_index2 <- createDataPartition(housing_data_filtered$price, p = 0.8, list = FALSE)
#train_data2 <- housing_data_filtered[train_index2, ]
#test_data2 <- housing_data_filtered[-train_index2, ]

#model2 <- lm(price ~ sqfeet + beds + baths + region , data = train_data2)

#predictions2 <- predict(model2, newdata = test_data2)

#RMSE <- sqrt(mean((test_data2$price - predictions2)^2,na.rm = TRUE))
#print(paste("Root Mean Squared Error (RMSE):", RMSE))

# Feature values describing one example listing; the commented-out prediction
# code that follows builds a one-row data frame from these four variables.
baths <- 2
beds <- 3
sqfeet <- 1500
region <- "West"

#nuevo_data <- data.frame(region = region, sqfeet = sqfeet, beds = beds, baths = baths)

#prediccion_precio3 <- predict(model2, newdata = nuevo_data)

#print(paste("The predicted price for a property in", region, "with", sqfeet, "square feet,", beds, "beds and", baths, "bads:", prediccion_precio3))

#install.packages("h2o")

#library(h2o)

#h2o.init()

#set.seed(0)  # Set seed for reproducibility
#n_samples_train <- 1000  # Size of the training dataset
#n_samples_test <- 200    # Size of the testing dataset

#mean_price_per_sqft <- mean(housing_data_filtered$price_per_sqft)
#sd_price_per_sqft <- sd(housing_data_filtered$price_per_sqft)

#price_per_square_feet <- rnorm(n_samples_train, mean = mean_price_per_sqft, sd = sd_price_per_sqft)
#region <- sample(c("West", "Midwest", "South", "North"), n_samples_train, replace = TRUE)
#beds <- sample(1:5, n_samples_train, replace = TRUE)
#baths <- sample(1:3, n_samples_train, replace = TRUE)

#train_df <- data.frame(price_per_square_feet, region, beds, baths)

#train_h2o <- as.h2o(train_df)

#if (!requireNamespace("rpart", quietly = TRUE)) {
#install.packages("rpart")
#}
#library(rpart)

#modelo_arbol <- rpart(price ~ ., data = housing_data_filtered)

#print(modelo_arbol)
#h2o.shutdown

#predicciones <- predict(modelo_arbol, newdata = housing_data_filtered)

#head(predicciones)

#modelo <- lm(price_per_sqft ~ type, data = housing_data_filtered)

#anova_resultado <- anova(modelo)

#print(anova_resultado)

#modelo_regresion <- lm(price_per_sqft ~ type + beds + baths + region, data = housing_data_filtered)

#summary(modelo_regresion)

#install.packages("rpart")

#library(rpart)

#modelo_arbol <- rpart(price_per_sqft ~ type + beds + baths + region, data = housing_data_filtered)

#plot(modelo_arbol)
#text(modelo_arbol)

#install.packages("randomForest")
#library(randomForest)

#housing_data_filtered <- na.omit(housing_data_filtered)

#set.seed(123) # For reproducibility

#modelo_bosque2 <- randomForest(price_per_sqft ~ type + beds + baths + region, data = housing_data_filtered)

#varImpPlot(modelo_bosque2)

#The type of property has the greatest influence on the price per square foot, followed by beds and region.

#tabla_contingencia <- table(housing_data_filtered$type, housing_data_filtered$region)

#prueba_chi <- chisq.test(tabla_contingencia)

#print(prueba_chi)
#We reject the null hypothesis, and we can conclude that there is a statistically significant relationship between type and region.

#correlaciones <- cor(housing_data_filtered[c("price_per_sqft", "beds", "baths")])

#print(correlaciones)

#install.packages("rpart")
#library(rpart)

#modelo_arbol <- rpart(price ~ region + type + beds + baths, data = housing_data_filtered)

#plot(modelo_arbol)
#text(modelo_arbol)

#housing_data_filtered$price_per_sqft <- ifelse(housing_data_filtered$sqfeet == 0, NA, housing_data_filtered$price / housing_data_filtered$sqfeet)
#for further analysis, we create a variable of price per square foot (NA when sqfeet is 0).

#price_per_sqft3 <- aggregate(housing_data_filtered$price_per_sqft, by = list(housing_data_filtered$type), FUN = mean, na.rm = TRUE)

#colnames(price_per_sqft3) <- c("type", "average_price_per_sqft")

#price_per_sqft3 <- price_per_sqft3[order(price_per_sqft3$average_price_per_sqft), ]

#barplot(price_per_sqft3$average_price_per_sqft, names.arg = price_per_sqft3$type, 
#       xlab = "Type", ylab = "Average Price per sqft", main = "Average Price per sqft by Type",
#        col = "green", las = 2)
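#based on the price per sqft, the most expensive state for rent is OH. (NOTE: this sentence repeats the by-state conclusion; the plot above groups by property type - TODO: state the most expensive type instead.)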

#new_model2 <- lm(price ~ sqfeet + beds + baths , data = housing_data_filtered)

#predict(new_model2,
#        data.frame(sqfeet = 970, beds = 2 , baths = 2 ,
#                   type = "condo" , region = "West", pets_allowed = 1))

#summary()

#View(housing_data_filtered)

#new_model3 <- lm(price ~ sqfeet + beds + baths + type + region + pets_allowed + comes_furnished , data = housing_data_filtered)

#summary(new_model3)

#housing_data_filtered$pred <- predict(new_model3, housing_data_filtered)
#cor(housing_data_filtered$pred, housing_data_filtered$price)

#predict(new_model3,
#        data.frame(sqfeet = 970, beds = 2 , baths = 2 ,
#                   type = "condo" , region = "West",comes_furnished = TRUE, pets_allowed = 1))

#comparing model 2 and model 3, the last model, judging by its adjusted R-squared, seems to predict the price more accurately.

#region <- "Midwest"
#sqfeet <- 1741
#baths <- 2.5
#type <- "condo"
#beds <- 3
#comes_furnished <- FALSE
#pets_allowed <- 0

#new_data2<- data.frame(region = region, sqfeet = sqfeet, beds = beds, baths = baths, comes_furnished = comes_furnished, pets_allowed = pets_allowed, type = type )

#prediccion_precio4 <- predict(new_model3, newdata = new_data2)
#print(prediccion_precio4)

#print(paste("The predicted price for a ",type," in ", region, "with", sqfeet, "square feet,", beds, "beds and", baths, "bads:", prediccion_precio4))

#region <- "Midwest"
#sqfeet <- 1741
#baths <- 2.5
#beds <- 3

#new_data3<- data.frame(region = region, sqfeet = sqfeet, beds = beds, baths = baths)

#prediccion_precio5 <- predict(new_model2, newdata = new_data3)
#print(prediccion_precio5)

#print(paste("The predicted price for a property in ", region, "with", sqfeet, "square feet,", beds, "beds and", baths, "bads:", prediccion_precio5))

#View(housing_data_filtered)