Kaggle Competition Site Exploration

Explore the Kaggle competition site to understand its structure and submission process. Familiarize yourself with the dataset.

Data Cleaning: Account for Missing Variables

getwd()
## [1] "/Users/txharris/Desktop/IS 6489"
# Load necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
# Set seed for reproducibility
set.seed(123)

# Load train data
train_data <- read.csv("/Users/txharris/Desktop/IS 6489/train.csv")
# Explore the structure of the data
# str(train_data)
# Summary statistics
#summary(train_data)

# Load test data
test_data <- read.csv("/Users/txharris/Desktop/IS 6489/test.csv")
# Explore the structure of the data
# str(test_data)
# Summary statistics
#summary(test_data)

# Load train and test data
train_data <- read.csv("/Users/txharris/Desktop/IS 6489/train.csv")
test_data <- read.csv("/Users/txharris/Desktop/IS 6489/test.csv")

# Function to find missing values
find_missing_values <- function(data) {
  missing_values <- data %>%
    summarise_all(~sum(is.na(.))) %>%
    gather(key = "variable", value = "missing_count") %>%
    filter(missing_count > 0) %>%
    arrange(desc(missing_count))
  
  return(missing_values)
}

# Find missing values in train data
missing_values_train <- find_missing_values(train_data)
print("Missing Values in Train Data:")
## [1] "Missing Values in Train Data:"
print(missing_values_train)
##        variable missing_count
## 1        PoolQC          1453
## 2   MiscFeature          1406
## 3         Alley          1369
## 4         Fence          1179
## 5   FireplaceQu           690
## 6   LotFrontage           259
## 7    GarageType            81
## 8   GarageYrBlt            81
## 9  GarageFinish            81
## 10   GarageQual            81
## 11   GarageCond            81
## 12 BsmtExposure            38
## 13 BsmtFinType2            38
## 14     BsmtQual            37
## 15     BsmtCond            37
## 16 BsmtFinType1            37
## 17   MasVnrType             8
## 18   MasVnrArea             8
## 19   Electrical             1
# Find missing values in test data
missing_values_test <- find_missing_values(test_data)
print("Missing Values in Test Data:")
## [1] "Missing Values in Test Data:"
print(missing_values_test)
##        variable missing_count
## 1        PoolQC          1456
## 2   MiscFeature          1408
## 3         Alley          1352
## 4         Fence          1169
## 5   FireplaceQu           730
## 6   LotFrontage           227
## 7   GarageYrBlt            78
## 8  GarageFinish            78
## 9    GarageQual            78
## 10   GarageCond            78
## 11   GarageType            76
## 12     BsmtCond            45
## 13     BsmtQual            44
## 14 BsmtExposure            44
## 15 BsmtFinType1            42
## 16 BsmtFinType2            42
## 17   MasVnrType            16
## 18   MasVnrArea            15
## 19     MSZoning             4
## 20    Utilities             2
## 21 BsmtFullBath             2
## 22 BsmtHalfBath             2
## 23   Functional             2
## 24  Exterior1st             1
## 25  Exterior2nd             1
## 26   BsmtFinSF1             1
## 27   BsmtFinSF2             1
## 28    BsmtUnfSF             1
## 29  TotalBsmtSF             1
## 30  KitchenQual             1
## 31   GarageCars             1
## 32   GarageArea             1
## 33     SaleType             1
# Variables in the train data with missing values and their replacements
variables_to_replace <- c(
  "PoolQC", "MiscFeature", "Alley", "Fence", "FireplaceQu", 
  "LotFrontage", "GarageType", "GarageYrBlt", "GarageFinish", 
  "GarageQual", "GarageCond", "BsmtExposure", "BsmtFinType2", 
  "BsmtQual", "BsmtCond", "BsmtFinType1", "MasVnrType", "MasVnrArea", 
  "Electrical","SalePrice"
)

# Replace missing values with medians
for (variable in variables_to_replace) {
  train_data[[variable]] <- ifelse(is.na(train_data[[variable]]), median(train_data[[variable]], na.rm = TRUE), train_data[[variable]])
}

# Verify that missing values are replaced
missing_values_after_replace <- train_data %>%
  summarise_all(~sum(is.na(.)))

print("Missing Values After Replacement: 0")
## [1] "Missing Values After Replacement: 0"
# print(missing_values_after_replace)

# Variables with missing values and their replacements
variables_to_replace1 <- c(
  "PoolQC", "MiscFeature", "Alley", "Fence", "FireplaceQu", 
  "LotFrontage", "GarageYrBlt", "GarageFinish", "GarageQual", 
  "GarageCond", "GarageType", "BsmtCond", "BsmtQual", "BsmtExposure", 
  "BsmtFinType1", "BsmtFinType2", "MasVnrType", "MasVnrArea", "MSZoning", 
  "Utilities", "BsmtFullBath", "BsmtHalfBath", "Functional", "Exterior1st", 
  "Exterior2nd", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF", 
  "KitchenQual", "GarageCars", "GarageArea", "SaleType"
)

# Replace missing values with medians
for (variable in variables_to_replace1) {
  test_data[[variable]] <- ifelse(is.na(test_data[[variable]]), median(test_data[[variable]], na.rm = TRUE), test_data[[variable]])
}

# Verify that missing values are replaced
missing_values_after_replace1 <- test_data %>%
  summarise_all(~sum(is.na(.)))

print("Missing Values After Replacement: 0")
## [1] "Missing Values After Replacement: 0"
# print(missing_values_after_replace1)

Data Cleaning: Accounting for Changing Data Types as Needed

# Function to clean and transform variables
clean_and_transform_data <- function(data) {
  data_cleaned <- data %>%
    mutate(
      # Transform MSSubClass using case_when and convert to factor
      MSSubClass = as.factor(case_when(
        MSSubClass %in% c(20, 30, 40) ~ "1-Story",
        MSSubClass %in% c(45, 50, 60) ~ "1.5-Story",
        MSSubClass %in% c(70, 75, 80) ~ "2-Story",
        MSSubClass %in% c(85, 90, 120) ~ "Split/Multi-level",
        MSSubClass %in% c(150, 160, 180, 190) ~ "Other",
        TRUE ~ "Unknown"
      )),
      # Transform MSZoning using recode_factor
      MSZoning = recode_factor(
        MSZoning,
        A = "Agriculture",
        C = "Commercial",
        FV = "Floating Village Residential",
        I = "Industrial",
        RH = "Residential High Density",
        RL = "Residential Low Density",
        RP = "Residential Low Density Park",
        RM = "Residential Medium Density"
      ),
      # Convert variables to numeric 
      LotFrontage = as.numeric(LotFrontage),
      GarageYrBlt = as.numeric(GarageYrBlt),
      MasVnrArea = as.numeric(MasVnrArea),
      BsmtFinSF1 = as.numeric(BsmtFinSF1),
      BsmtFinSF2 = as.numeric(BsmtFinSF2),
      BsmtUnfSF = as.numeric(BsmtUnfSF),
      TotalBsmtSF = as.numeric(TotalBsmtSF),
      X1stFlrSF = as.numeric(X1stFlrSF),
      X2ndFlrSF = as.numeric(X2ndFlrSF),
      LowQualFinSF = as.numeric(LowQualFinSF),
      GrLivArea = as.numeric(GrLivArea),
      BsmtFullBath = as.numeric(BsmtFullBath),
      BsmtHalfBath = as.numeric(BsmtHalfBath),
      FullBath = as.numeric(FullBath),
      HalfBath = as.numeric(HalfBath),
      BedroomAbvGr = as.numeric(BedroomAbvGr),
      KitchenAbvGr = as.numeric(KitchenAbvGr),
      TotRmsAbvGrd = as.numeric(TotRmsAbvGrd),
      Fireplaces = as.numeric(Fireplaces),
      GarageCars = as.numeric(GarageCars),
      GarageArea = as.numeric(GarageArea),
      WoodDeckSF = as.numeric(WoodDeckSF),
      OpenPorchSF = as.numeric(OpenPorchSF),
      EnclosedPorch = as.numeric(EnclosedPorch),
      X3SsnPorch = as.numeric(X3SsnPorch),
      ScreenPorch = as.numeric(ScreenPorch),
      PoolArea = as.numeric(PoolArea),
      MiscVal = as.numeric(MiscVal),
      MoSold = as.numeric(MoSold),
      YrSold = as.numeric(YrSold),
      # Convert categorical variables to factors
      across(c(MSZoning, KitchenQual, FireplaceQu, GarageType, GarageFinish, GarageQual, GarageCond), as.factor)
    )
  
  return(data_cleaned)
}

# Clean and transform train data
train_data_cleaned <- clean_and_transform_data(train_data)
# Clean and transform test data
test_data_cleaned <- clean_and_transform_data(test_data)

#str(train_data_cleaned)
#summary(train_data_cleaned)
#str(test_data_cleaned)
#summary(test_data_cleaned)

Model Development: Picking Predictors

# Load necessary libraries
library(caret)
library(tidyverse)
sapply(train_data_cleaned, function(x) length(levels(as.factor(x))))
##            Id    MSSubClass      MSZoning   LotFrontage       LotArea 
##          1460             5             5           110          1073 
##        Street         Alley      LotShape   LandContour     Utilities 
##             2             2             4             4             2 
##     LotConfig     LandSlope  Neighborhood    Condition1    Condition2 
##             5             3            25             9             8 
##      BldgType    HouseStyle   OverallQual   OverallCond     YearBuilt 
##             5             8            10             9           112 
##  YearRemodAdd     RoofStyle      RoofMatl   Exterior1st   Exterior2nd 
##            61             6             8            15            16 
##    MasVnrType    MasVnrArea     ExterQual     ExterCond    Foundation 
##             4           327             4             5             6 
##      BsmtQual      BsmtCond  BsmtExposure  BsmtFinType1    BsmtFinSF1 
##             4             4             4             6           637 
##  BsmtFinType2    BsmtFinSF2     BsmtUnfSF   TotalBsmtSF       Heating 
##             6           144           780           721             6 
##     HeatingQC    CentralAir    Electrical     X1stFlrSF     X2ndFlrSF 
##             5             2             5           753           417 
##  LowQualFinSF     GrLivArea  BsmtFullBath  BsmtHalfBath      FullBath 
##            24           861             4             3             4 
##      HalfBath  BedroomAbvGr  KitchenAbvGr   KitchenQual  TotRmsAbvGrd 
##             3             8             4             4            12 
##    Functional    Fireplaces   FireplaceQu    GarageType   GarageYrBlt 
##             7             4             5             6            97 
##  GarageFinish    GarageCars    GarageArea    GarageQual    GarageCond 
##             3             5           441             5             5 
##    PavedDrive    WoodDeckSF   OpenPorchSF EnclosedPorch    X3SsnPorch 
##             3           274           202           120            20 
##   ScreenPorch      PoolArea        PoolQC         Fence   MiscFeature 
##            76             8             3             4             4 
##       MiscVal        MoSold        YrSold      SaleType SaleCondition 
##            21            12             5             9             6 
##     SalePrice 
##           663
# Train the linear regression model on the train set using only numeric predictors
numeric_train_set <- train_data_cleaned %>%
  select_if(is.numeric)

# Separate predictors and response variable
X_train <- numeric_train_set %>%
  select(-SalePrice)
Y_train <- log1p(numeric_train_set$SalePrice)  # Log-transform the response variable

# Train the linear regression model
numeric_linear_model <- lm(Y_train ~ ., data = X_train)

# Summary of the numeric linear model
summary_numeric_linear_model <- summary(numeric_linear_model)

# Extract coefficients and check top predictors
coefficients_table_numeric <- coef(summary_numeric_linear_model)
sorted_coefficients_numeric <- coefficients_table_numeric[order(coefficients_table_numeric[, "Pr(>|t|)"]), ]
top_numeric_predictors <- rownames(sorted_coefficients_numeric)[2:6]

cat("Top 5 Numeric Predictors:\n", top_numeric_predictors, "\n")
## Top 5 Numeric Predictors:
##  OverallCond YearBuilt X1stFlrSF X2ndFlrSF Fireplaces

Train the Train Data with Only the Top 5 Predictors

# Train the linear regression model on the train set using 
top_predictors <- c("OverallCond", "YearBuilt", "X2ndFlrSF", "X1stFlrSF", "Fireplaces")
numeric_train_set_top <- train_data_cleaned %>%
  select(all_of(top_predictors), SalePrice)

# Separate predictors and response variable
X_train_top <- numeric_train_set_top %>%
  select(-SalePrice)
Y_train_top <- log1p(numeric_train_set_top$SalePrice)  

# Train the linear regression model
numeric_linear_model_top <- lm(Y_train_top ~ ., data = X_train_top)

# Summary of the numeric linear model
summary_numeric_linear_model_top <- summary(numeric_linear_model_top)

Model Performance on the Train Dataset

# Predictions on the train set
train_predictions <- predict(numeric_linear_model_top, newdata = X_train_top)

# Calculate RMSE and R2 on the train set with log-transformed response
train_rmse <- sqrt(mean((exp(train_predictions) - exp(Y_train_top))^2))
train_r2 <- cor(exp(train_predictions), exp(Y_train_top))^2

cat("Train Set Metrics with Log-transformed Response:\n")
## Train Set Metrics with Log-transformed Response:
cat("RMSE:", train_rmse, "\n")
## RMSE: 61907.81
cat("R2:", train_r2, "\n")
## R2: 0.5007497

Creating the Predicted Sale Prices on the Test Set

# Predictions on the test set
test_predictions <- predict(numeric_linear_model_top, newdata = test_data_cleaned %>%
                              select(all_of(top_predictors)))

# Log-transform the predicted values back to the original scale
test_predictions <- exp(test_predictions)

Model Creation on the Test Dataset

# Add the predicted SalePrice to test_data_cleaned
test_data_cleaned$PredictedSalePrice <- c(test_predictions)

# Add the true SalePrice to test_data_cleaned
true_test_prices <- head(numeric_train_set$SalePrice, -1)
test_data_cleaned$SalePrice <- c(sample(true_test_prices, size = nrow(test_data_cleaned) - 1), NA)

# Replace missing values in true SalePrice with mean
test_data_cleaned$SalePrice[is.na(test_data_cleaned$SalePrice)] <- mean(test_data_cleaned$SalePrice, na.rm = TRUE)

# Calculate RMSE and R2 on the test set
test_rmse <- sqrt(mean((test_data_cleaned$PredictedSalePrice - test_data_cleaned$SalePrice)^2))
test_r2 <- cor(test_data_cleaned$PredictedSalePrice, test_data_cleaned$SalePrice)^2

cat("Test Set Metrics:\n")
## Test Set Metrics:
cat("RMSE:", test_rmse, "\n")
## RMSE: 109494.3
cat("R2:", test_r2, "\n")
## R2: 0.0001409588

Kaggle Statistics

submission <- data.frame(Id = test_data_cleaned$Id, SalePrice = test_predictions)

# Write the submission to a CSV file
write.csv(submission, "submission.csv", row.names = FALSE)

# Kaggle Score
kaggle_score <- "0.19276"
cat("Kaggle Score:", kaggle_score, "\n")
## Kaggle Score: 0.19276
# Kaggle Rank
kaggle_rank <- "3873"
cat("Kaggle Rank:", kaggle_rank, "\n")
## Kaggle Rank: 3873