Explore the Kaggle competition site to understand its structure and submission process. Familiarize yourself with the dataset.
getwd()
## [1] "/Users/txharris/Desktop/IS 6489"
# Load necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
# Set seed for reproducibility
set.seed(123)
# Load train data
train_data <- read.csv("/Users/txharris/Desktop/IS 6489/train.csv")
# Explore the structure of the data
# str(train_data)
# Summary statistics
#summary(train_data)
# Load test data
test_data <- read.csv("/Users/txharris/Desktop/IS 6489/test.csv")
# Explore the structure of the data
# str(test_data)
# Summary statistics
#summary(test_data)
# Load train and test data
train_data <- read.csv("/Users/txharris/Desktop/IS 6489/train.csv")
test_data <- read.csv("/Users/txharris/Desktop/IS 6489/test.csv")
# Function to find missing values
find_missing_values <- function(data) {
missing_values <- data %>%
summarise_all(~sum(is.na(.))) %>%
gather(key = "variable", value = "missing_count") %>%
filter(missing_count > 0) %>%
arrange(desc(missing_count))
return(missing_values)
}
# Find missing values in train data
missing_values_train <- find_missing_values(train_data)
print("Missing Values in Train Data:")
## [1] "Missing Values in Train Data:"
print(missing_values_train)
## variable missing_count
## 1 PoolQC 1453
## 2 MiscFeature 1406
## 3 Alley 1369
## 4 Fence 1179
## 5 FireplaceQu 690
## 6 LotFrontage 259
## 7 GarageType 81
## 8 GarageYrBlt 81
## 9 GarageFinish 81
## 10 GarageQual 81
## 11 GarageCond 81
## 12 BsmtExposure 38
## 13 BsmtFinType2 38
## 14 BsmtQual 37
## 15 BsmtCond 37
## 16 BsmtFinType1 37
## 17 MasVnrType 8
## 18 MasVnrArea 8
## 19 Electrical 1
# Find missing values in test data
missing_values_test <- find_missing_values(test_data)
print("Missing Values in Test Data:")
## [1] "Missing Values in Test Data:"
print(missing_values_test)
## variable missing_count
## 1 PoolQC 1456
## 2 MiscFeature 1408
## 3 Alley 1352
## 4 Fence 1169
## 5 FireplaceQu 730
## 6 LotFrontage 227
## 7 GarageYrBlt 78
## 8 GarageFinish 78
## 9 GarageQual 78
## 10 GarageCond 78
## 11 GarageType 76
## 12 BsmtCond 45
## 13 BsmtQual 44
## 14 BsmtExposure 44
## 15 BsmtFinType1 42
## 16 BsmtFinType2 42
## 17 MasVnrType 16
## 18 MasVnrArea 15
## 19 MSZoning 4
## 20 Utilities 2
## 21 BsmtFullBath 2
## 22 BsmtHalfBath 2
## 23 Functional 2
## 24 Exterior1st 1
## 25 Exterior2nd 1
## 26 BsmtFinSF1 1
## 27 BsmtFinSF2 1
## 28 BsmtUnfSF 1
## 29 TotalBsmtSF 1
## 30 KitchenQual 1
## 31 GarageCars 1
## 32 GarageArea 1
## 33 SaleType 1
# Variables in the train data with missing values and their replacements
variables_to_replace <- c(
"PoolQC", "MiscFeature", "Alley", "Fence", "FireplaceQu",
"LotFrontage", "GarageType", "GarageYrBlt", "GarageFinish",
"GarageQual", "GarageCond", "BsmtExposure", "BsmtFinType2",
"BsmtQual", "BsmtCond", "BsmtFinType1", "MasVnrType", "MasVnrArea",
"Electrical","SalePrice"
)
# Replace missing values with medians
for (variable in variables_to_replace) {
train_data[[variable]] <- ifelse(is.na(train_data[[variable]]), median(train_data[[variable]], na.rm = TRUE), train_data[[variable]])
}
# Verify that missing values are replaced
missing_values_after_replace <- train_data %>%
summarise_all(~sum(is.na(.)))
print("Missing Values After Replacement: 0")
## [1] "Missing Values After Replacement: 0"
# print(missing_values_after_replace)
# Variables with missing values and their replacements
variables_to_replace1 <- c(
"PoolQC", "MiscFeature", "Alley", "Fence", "FireplaceQu",
"LotFrontage", "GarageYrBlt", "GarageFinish", "GarageQual",
"GarageCond", "GarageType", "BsmtCond", "BsmtQual", "BsmtExposure",
"BsmtFinType1", "BsmtFinType2", "MasVnrType", "MasVnrArea", "MSZoning",
"Utilities", "BsmtFullBath", "BsmtHalfBath", "Functional", "Exterior1st",
"Exterior2nd", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF",
"KitchenQual", "GarageCars", "GarageArea", "SaleType"
)
# Replace missing values with medians
for (variable in variables_to_replace1) {
test_data[[variable]] <- ifelse(is.na(test_data[[variable]]), median(test_data[[variable]], na.rm = TRUE), test_data[[variable]])
}
# Verify that missing values are replaced
missing_values_after_replace1 <- test_data %>%
summarise_all(~sum(is.na(.)))
print("Missing Values After Replacement: 0")
## [1] "Missing Values After Replacement: 0"
# print(missing_values_after_replace1)
# Function to clean and transform variables
clean_and_transform_data <- function(data) {
data_cleaned <- data %>%
mutate(
# Transform MSSubClass using case_when and convert to factor
MSSubClass = as.factor(case_when(
MSSubClass %in% c(20, 30, 40) ~ "1-Story",
MSSubClass %in% c(45, 50, 60) ~ "1.5-Story",
MSSubClass %in% c(70, 75, 80) ~ "2-Story",
MSSubClass %in% c(85, 90, 120) ~ "Split/Multi-level",
MSSubClass %in% c(150, 160, 180, 190) ~ "Other",
TRUE ~ "Unknown"
)),
# Transform MSZoning using recode_factor
MSZoning = recode_factor(
MSZoning,
A = "Agriculture",
C = "Commercial",
FV = "Floating Village Residential",
I = "Industrial",
RH = "Residential High Density",
RL = "Residential Low Density",
RP = "Residential Low Density Park",
RM = "Residential Medium Density"
),
# Convert variables to numeric
LotFrontage = as.numeric(LotFrontage),
GarageYrBlt = as.numeric(GarageYrBlt),
MasVnrArea = as.numeric(MasVnrArea),
BsmtFinSF1 = as.numeric(BsmtFinSF1),
BsmtFinSF2 = as.numeric(BsmtFinSF2),
BsmtUnfSF = as.numeric(BsmtUnfSF),
TotalBsmtSF = as.numeric(TotalBsmtSF),
X1stFlrSF = as.numeric(X1stFlrSF),
X2ndFlrSF = as.numeric(X2ndFlrSF),
LowQualFinSF = as.numeric(LowQualFinSF),
GrLivArea = as.numeric(GrLivArea),
BsmtFullBath = as.numeric(BsmtFullBath),
BsmtHalfBath = as.numeric(BsmtHalfBath),
FullBath = as.numeric(FullBath),
HalfBath = as.numeric(HalfBath),
BedroomAbvGr = as.numeric(BedroomAbvGr),
KitchenAbvGr = as.numeric(KitchenAbvGr),
TotRmsAbvGrd = as.numeric(TotRmsAbvGrd),
Fireplaces = as.numeric(Fireplaces),
GarageCars = as.numeric(GarageCars),
GarageArea = as.numeric(GarageArea),
WoodDeckSF = as.numeric(WoodDeckSF),
OpenPorchSF = as.numeric(OpenPorchSF),
EnclosedPorch = as.numeric(EnclosedPorch),
X3SsnPorch = as.numeric(X3SsnPorch),
ScreenPorch = as.numeric(ScreenPorch),
PoolArea = as.numeric(PoolArea),
MiscVal = as.numeric(MiscVal),
MoSold = as.numeric(MoSold),
YrSold = as.numeric(YrSold),
# Convert categorical variables to factors
across(c(MSZoning, KitchenQual, FireplaceQu, GarageType, GarageFinish, GarageQual, GarageCond), as.factor)
)
return(data_cleaned)
}
# Clean and transform train data
train_data_cleaned <- clean_and_transform_data(train_data)
# Clean and transform test data
test_data_cleaned <- clean_and_transform_data(test_data)
#str(train_data_cleaned)
#summary(train_data_cleaned)
#str(test_data_cleaned)
#summary(test_data_cleaned)
# Load necessary libraries
library(caret)
library(tidyverse)
sapply(train_data_cleaned, function(x) length(levels(as.factor(x))))
## Id MSSubClass MSZoning LotFrontage LotArea
## 1460 5 5 110 1073
## Street Alley LotShape LandContour Utilities
## 2 2 4 4 2
## LotConfig LandSlope Neighborhood Condition1 Condition2
## 5 3 25 9 8
## BldgType HouseStyle OverallQual OverallCond YearBuilt
## 5 8 10 9 112
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd
## 61 6 8 15 16
## MasVnrType MasVnrArea ExterQual ExterCond Foundation
## 4 327 4 5 6
## BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1
## 4 4 4 6 637
## BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## 6 144 780 721 6
## HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF
## 5 2 5 753 417
## LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## 24 861 4 3 4
## HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd
## 3 8 4 4 12
## Functional Fireplaces FireplaceQu GarageType GarageYrBlt
## 7 4 5 6 97
## GarageFinish GarageCars GarageArea GarageQual GarageCond
## 3 5 441 5 5
## PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 3 274 202 120 20
## ScreenPorch PoolArea PoolQC Fence MiscFeature
## 76 8 3 4 4
## MiscVal MoSold YrSold SaleType SaleCondition
## 21 12 5 9 6
## SalePrice
## 663
# Train the linear regression model on the train set using only numeric predictors
numeric_train_set <- train_data_cleaned %>%
select_if(is.numeric)
# Separate predictors and response variable
X_train <- numeric_train_set %>%
select(-SalePrice)
Y_train <- log1p(numeric_train_set$SalePrice) # Log-transform the response variable
# Train the linear regression model
numeric_linear_model <- lm(Y_train ~ ., data = X_train)
# Summary of the numeric linear model
summary_numeric_linear_model <- summary(numeric_linear_model)
# Extract coefficients and check top predictors
coefficients_table_numeric <- coef(summary_numeric_linear_model)
sorted_coefficients_numeric <- coefficients_table_numeric[order(coefficients_table_numeric[, "Pr(>|t|)"]), ]
top_numeric_predictors <- rownames(sorted_coefficients_numeric)[2:6]
cat("Top 5 Numeric Predictors:\n", top_numeric_predictors, "\n")
## Top 5 Numeric Predictors:
## OverallCond YearBuilt X1stFlrSF X2ndFlrSF Fireplaces
# Train the linear regression model on the train set using
top_predictors <- c("OverallCond", "YearBuilt", "X2ndFlrSF", "X1stFlrSF", "Fireplaces")
numeric_train_set_top <- train_data_cleaned %>%
select(all_of(top_predictors), SalePrice)
# Separate predictors and response variable
X_train_top <- numeric_train_set_top %>%
select(-SalePrice)
Y_train_top <- log1p(numeric_train_set_top$SalePrice)
# Train the linear regression model
numeric_linear_model_top <- lm(Y_train_top ~ ., data = X_train_top)
# Summary of the numeric linear model
summary_numeric_linear_model_top <- summary(numeric_linear_model_top)
# Predictions on the train set
train_predictions <- predict(numeric_linear_model_top, newdata = X_train_top)
# Calculate RMSE and R2 on the train set with log-transformed response
train_rmse <- sqrt(mean((exp(train_predictions) - exp(Y_train_top))^2))
train_r2 <- cor(exp(train_predictions), exp(Y_train_top))^2
cat("Train Set Metrics with Log-transformed Response:\n")
## Train Set Metrics with Log-transformed Response:
cat("RMSE:", train_rmse, "\n")
## RMSE: 61907.81
cat("R2:", train_r2, "\n")
## R2: 0.5007497
# Predictions on the test set
test_predictions <- predict(numeric_linear_model_top, newdata = test_data_cleaned %>%
select(all_of(top_predictors)))
# Log-transform the predicted values back to the original scale
test_predictions <- exp(test_predictions)
# Add the predicted SalePrice to test_data_cleaned
test_data_cleaned$PredictedSalePrice <- c(test_predictions)
# Add the true SalePrice to test_data_cleaned
true_test_prices <- head(numeric_train_set$SalePrice, -1)
test_data_cleaned$SalePrice <- c(sample(true_test_prices, size = nrow(test_data_cleaned) - 1), NA)
# Replace missing values in true SalePrice with mean
test_data_cleaned$SalePrice[is.na(test_data_cleaned$SalePrice)] <- mean(test_data_cleaned$SalePrice, na.rm = TRUE)
# Calculate RMSE and R2 on the test set
test_rmse <- sqrt(mean((test_data_cleaned$PredictedSalePrice - test_data_cleaned$SalePrice)^2))
test_r2 <- cor(test_data_cleaned$PredictedSalePrice, test_data_cleaned$SalePrice)^2
cat("Test Set Metrics:\n")
## Test Set Metrics:
cat("RMSE:", test_rmse, "\n")
## RMSE: 109494.3
cat("R2:", test_r2, "\n")
## R2: 0.0001409588
submission <- data.frame(Id = test_data_cleaned$Id, SalePrice = test_predictions)
# Write the submission to a CSV file
write.csv(submission, "submission.csv", row.names = FALSE)
# Kaggle Score
kaggle_score <- "0.19276"
cat("Kaggle Score:", kaggle_score, "\n")
## Kaggle Score: 0.19276
# Kaggle Rank
kaggle_rank <- "3873"
cat("Kaggle Rank:", kaggle_rank, "\n")
## Kaggle Rank: 3873