#getwd()
#read.csv("C:/Users/Nicolas Bertuleit/Documents/housing.csv")
#housing_data <-read.csv("C:/Users/Nicolas Bertuleit/Documents/housing.csv")
#We assigned a name to our dataset.
#housing_data <- subset(housing_data, select = -c(url, region_url,image_url,description))
#We deleted some columns that we didn't need for our analysis.
#write.csv(housing_data, "Housing_data2.csv", row.names = FALSE)
#Re-write the dataset without the deleted columns.
#housing_data2 <-read.csv("C:/Users/Nicolas Bertuleit/Documents/housing_data2.csv")
#View(housing_data2)
#str(housing_data2)
#structure of the data set(we ran it a few times, already modified)
#summary(housing_data2)
#housing_data2$dogs_allowed <- as.logical(housing_data2$dogs_allowed)
#We converted this variable from boolean to logical.
#housing_data2$cats_allowed <- as.logical(housing_data2$cats_allowed)
#We converted this variable from boolean to logical.
#housing_data2$smoking_allowed <- as.logical(housing_data2$smoking_allowed)
#We converted this variable from boolean to logical.
#housing_data2$wheelchair_access <- as.logical(housing_data2$wheelchair_access)
#We converted this variable from boolean to logical.
#housing_data2$comes_furnished <- as.logical(housing_data2$comes_furnished)
#We converted this variable from boolean to logical.
#housing_data2$electric_vehicle_charge <- as.logical(housing_data2$electric_vehicle_charge)
#summary(housing_data2)
#summary(housing_data2$price)
#To understand the values of price: the dataset included values that were extremely large compared to the mean and median of the rest of the dataset.
#range(housing_data2$price)
#quantile(housing_data2$price)
#housing_data2 <- subset(housing_data2, price <= 20000)
#To clean the data set, we eliminated the rows that included prices above $20,000.
#range(housing_data2$price)
#housing_data2 <- subset(housing_data2, nchar(state) == 2)
##To clean the data set, we eliminated the rows that included states with more than 2 characters.
#summary(housing_data2)
#hist(housing_data2$price)
#To visualize the distribution of prices in our data set.
#boxplot(housing_data2$price)
#housing_data <- na.omit(housing_data)
#Q1 <- quantile(housing_data$price_per_sqft, 0.25)
#Q3 <- quantile(housing_data$price_per_sqft, 0.75)
#IQR <- Q3 - Q1
#lower_bound <- Q1 - 1.5 * IQR
#upper_bound <- Q3 + 1.5 * IQR
#housing_data <- housing_data[housing_data$price_per_sqft >= lower_bound & housing_data$price_per_sqft <= upper_bound, ]
#price_per_state <- aggregate(housing_data2$price, by = list(housing_data2$state), FUN = mean)
#Creating a variable of price per state for further analysis. 
#colnames(price_per_state) <- c("state", "average_Price")
#price_per_state <- price_per_state[order(price_per_state
#                                        $average_Price), ]
#barplot(price_per_state$average_Price, names.arg = price_per_state$state, 
#        xlab = "state", ylab = "average Price", main = "average price by state",
#        col = "blue", las = 2) 

#Based on the analysis, we can conclude that the most expensive state to rent a property is Hawaii. 
#housing_data2$price_per_sqft <- ifelse(housing_data2$sqfeet == 0, NA, housing_data2$price / housing_data2$sqfeet)
#For further analysis, we create a price-per-square-foot variable and then average it by state.
#price_per_sqft2 <- aggregate(housing_data2$price_per_sqft, by = list(housing_data2$state), FUN = mean, na.rm = TRUE)
#colnames(price_per_sqft2) <- c("state", "average_price_per_sqft")
#price_per_sqft2 <- price_per_sqft2[order(price_per_sqft2$average_price_per_sqft), ]
#barplot(price_per_sqft2$average_price_per_sqft, names.arg = price_per_sqft2$state, 
#        xlab = "State", ylab = "Average Price per sqft", main = "Average Price per sqft by State",
#        col = "green", las = 2)
#based on the price per sqft, the most expensive state for rent is OH. 
#sum(is.na(housing_data2$sqfeet))
#There are 3882 missing values in the sqfeet column of our dataset.
#plot(housing_data2$price, housing_data2$sqfeet)
#cor(housing_data2$price,housing_data2$sqfeet)
#table(housing_data2$state)
#cor(housing_data2[c("beds", "price", "baths", "sqfeet")])
#cor(housing_data2[c("beds", "price", "baths", "sqfeet","dogs_allowed")])
#install.packages("ggplot2")
#library(ggplot2)
#our.data <- data.frame(sqfeet = housing_data2$sqfeet, price_per_sqft = housing_data2$price_per_sqft)
#gg <- ggplot(our.data, aes(sqfeet, price_per_sqft))
#gg <- gg + geom_point()
#gg <- gg + geom_smooth(method = "lm", se=F, color="red")
#gg <- gg + ylim(0, 6)
#gg <- gg + xlim(500, 1500)
#gg <- gg + theme_bw()
#print(gg)
#The bigger the apartment, the lower the price per square foot.
#housing_data_filtered$pets_allowed <- ifelse(housing_data_filtered$cats_allowed == 1 | housing_data_filtered$dogs_allowed == 1, 1, 0)
#Next, we clean the property type column by removing types with fewer than 100 listings.

#property_type <- table(housing_data$type)
#to_delete <- names(property_type[property_type < 100])
#housing_data_filtered <- subset(housing_data2, !(type %in% to_delete))
#nrow(housing_data_filtered)
#View(housing_data_filtered)
#install.packages("caret")
#library(caret)
#regiones <- c("ak", "az", "ca", "co", "hi", "id", "mt", "nv", "nm", "or", "ut", "wa", "wy")       # West
#regiones <- c(regiones, "il", "in", "ia", "ks", "mi", "mn", "mo", "ne", "nd", "oh", "sd", "wi")   # Midwest
#regiones <- c(regiones, "al", "ar", "de", "fl", "ga", "ky", "la", "md", "ms", "nc", "ok", "sc", "tn", "tx", "va", "wv")   # South
#regiones <- c(regiones, "ct", "me", "ma", "nh", "nj", "ny", "pa", "ri", "vt","dc")   # North
#asignar_region <- function(estado) {
#  for (i in 1:length(regiones)) {
#    if (tolower(estado) %in% regiones[i]) {
#      return(ifelse(i <= 13, "West", ifelse(i <= 25, "Midwest", ifelse(i <= 41, "South", "North"))))
 #   }
#  }
# return("Desconocido")  # Just in case
#}
#housing_data_filtered$region <- sapply(housing_data_filtered$state, asignar_region)
#table(housing_data_filtered$region)
#View(housing_data_filtered)
#set.seed(123)  
#train_index <- createDataPartition(housing_data_filtered$price, p = 0.8, list = FALSE)
#train_data <- housing_data_filtered[train_index, ]
#test_data <- housing_data_filtered[-train_index, ]
#model <- lm(price ~ sqfeet + beds + baths + dogs_allowed + cats_allowed+ smoking_allowed + wheelchair_access + electric_vehicle_charge + comes_furnished + region , data = train_data)
#predictions <- predict(model, newdata = test_data)
#RMSE <- sqrt(mean((test_data$price - predictions)^2,na.rm = TRUE))
#print(paste("Root Mean Squared Error (RMSE):", RMSE))
#summary(model)
#set.seed(123)  
#train_index2 <- createDataPartition(housing_data_filtered$price, p = 0.8, list = FALSE)
#train_data2 <- housing_data_filtered[train_index2, ]
#test_data2 <- housing_data_filtered[-train_index2, ]
#model2 <- lm(price ~ sqfeet + beds + baths + region , data = train_data2)
#predictions2 <- predict(model2, newdata = test_data2)
#RMSE <- sqrt(mean((test_data2$price - predictions2)^2,na.rm = TRUE))
#print(paste("Root Mean Squared Error (RMSE):", RMSE))
# Example property used as the input for a single price prediction
# (see the commented-out `nuevo_data` / `predict(model2, ...)` lines below):
# a 3-bedroom, 2-bathroom, 1500 sq ft listing in the "West" region.
beds <- 3
baths <- 2
sqfeet <- 1500
region <- "West"
#nuevo_data <- data.frame(region = region, sqfeet = sqfeet, beds = beds, baths = baths)
#prediccion_precio3 <- predict(model2, newdata = nuevo_data)
#print(paste("The predicted price for a property in", region, "with", sqfeet, "square feet,", beds, "beds and", baths, "bads:", prediccion_precio3))
#install.packages("h2o")
#library(h2o)
#h2o.init()
#set.seed(0)  # Set seed for reproducibility
#n_samples_train <- 1000  # Size of the training dataset
#n_samples_test <- 200    # Size of the testing dataset
#mean_price_per_sqft <- mean(housing_data_filtered$price_per_sqft)
#sd_price_per_sqft <- sd(housing_data_filtered$price_per_sqft)
#price_per_square_feet <- rnorm(n_samples_train, mean = mean_price_per_sqft, sd = sd_price_per_sqft)
#region <- sample(c("West", "Midwest", "South", "North"), n_samples_train, replace = TRUE)
#beds <- sample(1:5, n_samples_train, replace = TRUE)
#baths <- sample(1:3, n_samples_train, replace = TRUE)
#train_df <- data.frame(price_per_square_feet, region, beds, baths)
#train_h2o <- as.h2o(train_df)
#if (!requireNamespace("rpart", quietly = TRUE)) {
#install.packages("rpart")
#}
#library(rpart)
#modelo_arbol <- rpart(price ~ ., data = housing_data_filtered)
#print(modelo_arbol)
#h2o.shutdown
#predicciones <- predict(modelo_arbol, newdata = housing_data_filtered)
#head(predicciones)
#modelo <- lm(price_per_sqft ~ type, data = housing_data_filtered)
#anova_resultado <- anova(modelo)
#print(anova_resultado)
#modelo_regresion <- lm(price_per_sqft ~ type + beds + baths + region, data = housing_data_filtered)
#summary(modelo_regresion)
#install.packages("rpart")
#library(rpart)
#modelo_arbol <- rpart(price_per_sqft ~ type + beds + baths + region, data = housing_data_filtered)
#plot(modelo_arbol)
#text(modelo_arbol)
#install.packages("randomForest")
#library(randomForest)
#housing_data_filtered <- na.omit(housing_data_filtered)
#set.seed(123) # For reproducibility
#modelo_bosque2 <- randomForest(price_per_sqft ~ type + beds + baths + region, data = housing_data_filtered)
#varImpPlot(modelo_bosque2)

#The type of property has the greatest influence on the price per square foot, followed by beds and region.
#tabla_contingencia <- table(housing_data_filtered$type, housing_data_filtered$region)
#prueba_chi <- chisq.test(tabla_contingencia)
#print(prueba_chi)
#We reject the null hypothesis, and we can conclude that there is a statistically significant association between type and region.
#correlaciones <- cor(housing_data_filtered[c("price_per_sqft", "beds", "baths")])
#print(correlaciones)
#install.packages("rpart")
#library(rpart)
#modelo_arbol <- rpart(price ~ region + type + beds + baths, data = housing_data_filtered)
#plot(modelo_arbol)
#text(modelo_arbol)
#housing_data_filtered$price_per_sqft <- ifelse(housing_data_filtered$sqfeet == 0, NA, housing_data_filtered$price / housing_data_filtered$sqfeet)
#For further analysis, we create a price-per-square-foot variable and then average it by property type.
#price_per_sqft3 <- aggregate(housing_data_filtered$price_per_sqft, by = list(housing_data_filtered$type), FUN = mean, na.rm = TRUE)
#colnames(price_per_sqft3) <- c("type", "average_price_per_sqft")
#price_per_sqft3 <- price_per_sqft3[order(price_per_sqft3$average_price_per_sqft), ]
#barplot(price_per_sqft3$average_price_per_sqft, names.arg = price_per_sqft3$type, 
#       xlab = "Type", ylab = "Average Price per sqft", main = "Average Price per sqft by Type",
#        col = "green", las = 2)
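#Based on the price per sqft, the most expensive state for rent is OH.
#NOTE(review): this conclusion comes from the by-state analysis; the chart above groups by property type - TODO state the most expensive type here.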
#new_model2 <- lm(price ~ sqfeet + beds + baths , data = housing_data_filtered)
#predict(new_model2,
#        data.frame(sqfeet = 970, beds = 2 , baths = 2 ,
#                   type = "condo" , region = "West", pets_allowed = 1))
#summary()
#View(housing_data_filtered)
#new_model3 <- lm(price ~ sqfeet + beds + baths + type + region + pets_allowed + comes_furnished , data = housing_data_filtered)
#summary(new_model3)
#housing_data_filtered$pred <- predict(new_model3, housing_data_filtered)
#cor(housing_data_filtered$pred, housing_data_filtered$price)
#predict(new_model3,
#        data.frame(sqfeet = 970, beds = 2 , baths = 2 ,
#                   type = "condo" , region = "West",comes_furnished = TRUE, pets_allowed = 1))
#Comparing model 2 and model 3, the last model, which has the higher adjusted R-squared, seems to predict the price more accurately.
#region <- "Midwest"
#sqfeet <- 1741
#baths <- 2.5
#type <- "condo"
#beds <- 3
#comes_furnished <- FALSE
#pets_allowed <- 0
#new_data2<- data.frame(region = region, sqfeet = sqfeet, beds = beds, baths = baths, comes_furnished = comes_furnished, pets_allowed = pets_allowed, type = type )
#prediccion_precio4 <- predict(new_model3, newdata = new_data2)
#print(prediccion_precio4)
#print(paste("The predicted price for a ",type," in ", region, "with", sqfeet, "square feet,", beds, "beds and", baths, "bads:", prediccion_precio4))
#region <- "Midwest"
#sqfeet <- 1741
#baths <- 2.5
#beds <- 3
#new_data3<- data.frame(region = region, sqfeet = sqfeet, beds = beds, baths = baths)
#prediccion_precio5 <- predict(new_model2, newdata = new_data3)
#print(prediccion_precio5)
#print(paste("The predicted price for a property in ", region, "with", sqfeet, "square feet,", beds, "beds and", baths, "bads:", prediccion_precio5))
#View(housing_data_filtered)