Kaggle Competition Site Exploration
Data Cleaning: Account for Missing Variables
Data Cleaning: Accounting for Changing Data Types as Needed
Model Development: Picking Predictors
Train the Train Data with Only the Top 5 Predictors
Model Performance on the Train Dataset
Creating the Predicted Sale Prices on the Test Set
Model Creation on the Test Dataset
Kaggle Statistics

Kaggle Competition Site Exploration

Explore the Kaggle competition site to understand its structure and submission process. Familiarize yourself with the dataset.

Data Cleaning: Account for Missing Variables

# Print the current working directory for reference.
getwd()

## [1] "/Users/txharris/Desktop/IS 6489"

# Load necessary libraries
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(caret)

## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

# Set seed for reproducibility
set.seed(123)

# Directory holding the Kaggle CSV files. Defined once so the location only
# has to be edited in a single place.
data_dir <- "/Users/txharris/Desktop/IS 6489"

# Load train and test data (each file is read exactly once)
train_data <- read.csv(file.path(data_dir, "train.csv"))
test_data <- read.csv(file.path(data_dir, "test.csv"))

# Optional exploration of structure and summary statistics
# str(train_data)
# summary(train_data)
# str(test_data)
# summary(test_data)

# Function to find missing values
#
# Counts the NA entries in every column of `data` and reports the columns
# that have at least one, ordered from most to fewest missing values.
#
# @param data A data frame (or tibble).
# @return A data frame with one row per affected column and two columns:
#   `variable` (character, the column name) and `missing_count` (integer,
#   the number of NA entries). Zero rows when nothing is missing. A tibble
#   is returned when `data` is a tibble.
find_missing_values <- function(data) {
  stopifnot(is.data.frame(data))

  # vapply() guarantees exactly one integer per column, even for a data
  # frame without columns.
  na_counts <- vapply(data, function(column) sum(is.na(column)), integer(1))

  # Keep only affected columns, largest count first. Ordering on the negated
  # counts keeps tied columns in their original column order.
  na_counts <- na_counts[na_counts > 0]
  na_counts <- na_counts[order(-na_counts)]

  missing_values <- data.frame(
    variable = as.character(names(na_counts)),
    missing_count = unname(na_counts),
    stringsAsFactors = FALSE
  )

  # Hand a tibble back to callers that passed one in
  if (inherits(data, "tbl_df")) {
    missing_values <- tibble::as_tibble(missing_values)
  }

  missing_values
}

# Find the train columns that contain NA values (largest count first) and
# print the resulting table
missing_values_train <- find_missing_values(train_data)
print("Missing Values in Train Data:")

## [1] "Missing Values in Train Data:"

print(missing_values_train)

##        variable missing_count
## 1        PoolQC          1453
## 2   MiscFeature          1406
## 3         Alley          1369
## 4         Fence          1179
## 5   FireplaceQu           690
## 6   LotFrontage           259
## 7    GarageType            81
## 8   GarageYrBlt            81
## 9  GarageFinish            81
## 10   GarageQual            81
## 11   GarageCond            81
## 12 BsmtExposure            38
## 13 BsmtFinType2            38
## 14     BsmtQual            37
## 15     BsmtCond            37
## 16 BsmtFinType1            37
## 17   MasVnrType             8
## 18   MasVnrArea             8
## 19   Electrical             1

# Find the test columns that contain NA values (largest count first) and
# print the resulting table
missing_values_test <- find_missing_values(test_data)
print("Missing Values in Test Data:")

## [1] "Missing Values in Test Data:"

print(missing_values_test)

##        variable missing_count
## 1        PoolQC          1456
## 2   MiscFeature          1408
## 3         Alley          1352
## 4         Fence          1169
## 5   FireplaceQu           730
## 6   LotFrontage           227
## 7   GarageYrBlt            78
## 8  GarageFinish            78
## 9    GarageQual            78
## 10   GarageCond            78
## 11   GarageType            76
## 12     BsmtCond            45
## 13     BsmtQual            44
## 14 BsmtExposure            44
## 15 BsmtFinType1            42
## 16 BsmtFinType2            42
## 17   MasVnrType            16
## 18   MasVnrArea            15
## 19     MSZoning             4
## 20    Utilities             2
## 21 BsmtFullBath             2
## 22 BsmtHalfBath             2
## 23   Functional             2
## 24  Exterior1st             1
## 25  Exterior2nd             1
## 26   BsmtFinSF1             1
## 27   BsmtFinSF2             1
## 28    BsmtUnfSF             1
## 29  TotalBsmtSF             1
## 30  KitchenQual             1
## 31   GarageCars             1
## 32   GarageArea             1
## 33     SaleType             1

# Variables in the train data with missing values. They are detected from the
# data instead of being listed by hand, and SalePrice (the response) is
# excluded so the target is never imputed.
variables_to_replace <- setdiff(
  names(train_data)[colSums(is.na(train_data)) > 0],
  "SalePrice"
)

# Replace missing values: median for numeric columns, most frequent observed
# value for every other column. median() is only meaningful for numeric data;
# on a character column it returns an arbitrary middle string, or NA when the
# number of observed values is even, which leaves the gaps unfilled.
# NOTE(review): for columns such as PoolQC or Alley an NA may encode "feature
# not present" rather than an unknown value; an explicit level could be a
# better fill than the mode -- confirm against the data dictionary.
for (variable in variables_to_replace) {
  column <- train_data[[variable]]
  is_missing <- is.na(column)
  if (all(is_missing)) {
    next  # nothing observed to impute from
  }

  if (is.numeric(column)) {
    fill_value <- median(column, na.rm = TRUE)
  } else {
    # Mode that keeps the column's own type; ties go to the first value seen
    observed <- column[!is_missing]
    candidates <- unique(observed)
    fill_value <- candidates[which.max(tabulate(match(observed, candidates)))]
  }

  column[is_missing] <- fill_value
  train_data[[variable]] <- column
}

# Verify that missing values are replaced
missing_values_after_replace <- train_data %>%
  summarise(across(everything(), ~ sum(is.na(.x))))

# Report the measured total rather than a hard-coded zero
print(paste(
  "Missing Values After Replacement:",
  sum(missing_values_after_replace)
))

## [1] "Missing Values After Replacement: 0"

# print(missing_values_after_replace)

# Variables in the test data with missing values, detected from the data
# instead of being listed by hand.
variables_to_replace1 <- names(test_data)[colSums(is.na(test_data)) > 0]

# Replace missing values: median for numeric columns, most frequent observed
# value for every other column. median() is only meaningful for numeric data;
# on a character column it returns an arbitrary middle string, or NA when the
# number of observed values is even, which leaves the gaps unfilled.
# NOTE(review): the fill values are computed from the test data itself, as
# before; using the train statistics instead would treat both sets
# identically -- decide which is intended.
for (variable in variables_to_replace1) {
  column <- test_data[[variable]]
  is_missing <- is.na(column)
  if (all(is_missing)) {
    next  # nothing observed to impute from
  }

  if (is.numeric(column)) {
    fill_value <- median(column, na.rm = TRUE)
  } else {
    # Mode that keeps the column's own type; ties go to the first value seen
    observed <- column[!is_missing]
    candidates <- unique(observed)
    fill_value <- candidates[which.max(tabulate(match(observed, candidates)))]
  }

  column[is_missing] <- fill_value
  test_data[[variable]] <- column
}

# Verify that missing values are replaced
missing_values_after_replace1 <- test_data %>%
  summarise(across(everything(), ~ sum(is.na(.x))))

# Report the measured total rather than a hard-coded zero
print(paste(
  "Missing Values After Replacement:",
  sum(missing_values_after_replace1)
))

## [1] "Missing Values After Replacement: 0"

# print(missing_values_after_replace1)

Data Cleaning: Accounting for Changing Data Types as Needed

# Function to clean and transform variables
#
# Recodes MSSubClass and MSZoning into descriptive factors, coerces the
# count/area/date columns listed below to numeric, and converts the listed
# quality and garage columns to factors. All other columns pass through
# unchanged.
#
# @param data A data frame containing every column referenced below; used
#   for both the train and the test data.
# @return `data` with the same columns, the referenced ones retyped/recoded.
clean_and_transform_data <- function(data) {
  # Columns coerced to numeric
  numeric_columns <- c(
    "LotFrontage", "GarageYrBlt", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2",
    "BsmtUnfSF", "TotalBsmtSF", "X1stFlrSF", "X2ndFlrSF", "LowQualFinSF",
    "GrLivArea", "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath",
    "BedroomAbvGr", "KitchenAbvGr", "TotRmsAbvGrd", "Fireplaces",
    "GarageCars", "GarageArea", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch",
    "X3SsnPorch", "ScreenPorch", "PoolArea", "MiscVal", "MoSold", "YrSold"
  )

  # Categorical columns converted to factors
  factor_columns <- c(
    "MSZoning", "KitchenQual", "FireplaceQu", "GarageType", "GarageFinish",
    "GarageQual", "GarageCond"
  )

  data_cleaned <- data %>%
    mutate(
      # Group the MSSubClass dwelling codes by storey type. Per the data
      # dictionary 60/70/75 are 2- and 2.5-storey homes and 80/85 are
      # split-level; duplex (90), two-family conversion (190) and the PUD
      # codes (120-180) are collected under "Other".
      MSSubClass = as.factor(case_when(
        MSSubClass %in% c(20, 30, 40) ~ "1-Story",
        MSSubClass %in% c(45, 50) ~ "1.5-Story",
        MSSubClass %in% c(60, 70, 75) ~ "2-Story",
        MSSubClass %in% c(80, 85) ~ "Split/Multi-level",
        MSSubClass %in% c(90, 120, 150, 160, 180, 190) ~ "Other",
        TRUE ~ "Unknown"
      )),
      # Transform MSZoning using recode_factor. The commercial zone is
      # documented as "C" but stored in the CSV files as "C (all)", so both
      # spellings are mapped; keys that do not occur are ignored.
      MSZoning = recode_factor(
        MSZoning,
        A = "Agriculture",
        C = "Commercial",
        `C (all)` = "Commercial",
        FV = "Floating Village Residential",
        I = "Industrial",
        RH = "Residential High Density",
        RL = "Residential Low Density",
        RP = "Residential Low Density Park",
        RM = "Residential Medium Density"
      ),
      # Convert variables to numeric
      across(all_of(numeric_columns), as.numeric),
      # Convert categorical variables to factors
      across(all_of(factor_columns), as.factor)
    )

  data_cleaned
}

# Clean and transform train data with the shared cleaning function
train_data_cleaned <- clean_and_transform_data(train_data)
# Clean and transform test data with the same function
test_data_cleaned <- clean_and_transform_data(test_data)

#str(train_data_cleaned)
#summary(train_data_cleaned)
#str(test_data_cleaned)
#summary(test_data_cleaned)

Model Development: Picking Predictors

# Load necessary libraries
library(caret)
library(tidyverse)
# Number of distinct non-missing values per column. vapply() pins the result
# to exactly one integer per column, unlike sapply().
vapply(train_data_cleaned, function(x) nlevels(as.factor(x)), integer(1))

##            Id    MSSubClass      MSZoning   LotFrontage       LotArea 
##          1460             5             5           110          1073 
##        Street         Alley      LotShape   LandContour     Utilities 
##             2             2             4             4             2 
##     LotConfig     LandSlope  Neighborhood    Condition1    Condition2 
##             5             3            25             9             8 
##      BldgType    HouseStyle   OverallQual   OverallCond     YearBuilt 
##             5             8            10             9           112 
##  YearRemodAdd     RoofStyle      RoofMatl   Exterior1st   Exterior2nd 
##            61             6             8            15            16 
##    MasVnrType    MasVnrArea     ExterQual     ExterCond    Foundation 
##             4           327             4             5             6 
##      BsmtQual      BsmtCond  BsmtExposure  BsmtFinType1    BsmtFinSF1 
##             4             4             4             6           637 
##  BsmtFinType2    BsmtFinSF2     BsmtUnfSF   TotalBsmtSF       Heating 
##             6           144           780           721             6 
##     HeatingQC    CentralAir    Electrical     X1stFlrSF     X2ndFlrSF 
##             5             2             5           753           417 
##  LowQualFinSF     GrLivArea  BsmtFullBath  BsmtHalfBath      FullBath 
##            24           861             4             3             4 
##      HalfBath  BedroomAbvGr  KitchenAbvGr   KitchenQual  TotRmsAbvGrd 
##             3             8             4             4            12 
##    Functional    Fireplaces   FireplaceQu    GarageType   GarageYrBlt 
##             7             4             5             6            97 
##  GarageFinish    GarageCars    GarageArea    GarageQual    GarageCond 
##             3             5           441             5             5 
##    PavedDrive    WoodDeckSF   OpenPorchSF EnclosedPorch    X3SsnPorch 
##             3           274           202           120            20 
##   ScreenPorch      PoolArea        PoolQC         Fence   MiscFeature 
##            76             8             3             4             4 
##       MiscVal        MoSold        YrSold      SaleType SaleCondition 
##            21            12             5             9             6 
##     SalePrice 
##           663

# Train the linear regression model on the train set using only numeric predictors
numeric_train_set <- train_data_cleaned %>%
  select(where(is.numeric))

# Separate predictors and response variable. Id is only a row identifier, so
# it is kept out of the predictor set.
X_train <- numeric_train_set %>%
  select(-SalePrice, -any_of("Id"))
Y_train <- log1p(numeric_train_set$SalePrice)  # Log-transform the response variable

# Train the linear regression model
numeric_linear_model <- lm(Y_train ~ ., data = X_train)

# Summary of the numeric linear model
summary_numeric_linear_model <- summary(numeric_linear_model)

# Extract the coefficient table and sort it by p-value (smallest first)
coefficients_table_numeric <- coef(summary_numeric_linear_model)
p_value_order <- order(coefficients_table_numeric[, "Pr(>|t|)"])
sorted_coefficients_numeric <- coefficients_table_numeric[p_value_order, ]

# Remove the intercept by name before taking the top five. Its position after
# sorting is not fixed, so skipping row 1 could drop the strongest predictor.
ranked_predictors <- setdiff(
  rownames(sorted_coefficients_numeric),
  "(Intercept)"
)
top_numeric_predictors <- head(ranked_predictors, 5)

cat("Top 5 Numeric Predictors:\n", top_numeric_predictors, "\n")

## Top 5 Numeric Predictors:
##  OverallCond YearBuilt X1stFlrSF X2ndFlrSF Fireplaces

Train the Train Data with Only the Top 5 Predictors

# Fit the log-price linear model on the train set using only the five
# selected predictors
top_predictors <- c(
  "OverallCond", "YearBuilt", "X2ndFlrSF", "X1stFlrSF", "Fireplaces"
)
numeric_train_set_top <- select(
  train_data_cleaned,
  all_of(top_predictors), SalePrice
)

# Split into the predictor columns and the log1p-transformed response
X_train_top <- select(numeric_train_set_top, -SalePrice)
Y_train_top <- log1p(numeric_train_set_top$SalePrice)

# Fit the model and keep its summary for inspection
numeric_linear_model_top <- lm(Y_train_top ~ ., data = X_train_top)
summary_numeric_linear_model_top <- summary(numeric_linear_model_top)

Model Performance on the Train Dataset

# Predictions on the train set (on the log1p scale the model was fit on)
train_predictions <- predict(numeric_linear_model_top, newdata = X_train_top)

# Back-transform with expm1(), the exact inverse of log1p(), so predictions
# and observations are both on the original price scale
train_predicted_prices <- expm1(train_predictions)
train_actual_prices <- expm1(Y_train_top)

# RMSE on the original price scale; R2 is taken as the squared Pearson
# correlation between predicted and actual prices
train_rmse <- sqrt(mean((train_predicted_prices - train_actual_prices)^2))
train_r2 <- cor(train_predicted_prices, train_actual_prices)^2

cat("Train Set Metrics with Log-transformed Response:\n")

## Train Set Metrics with Log-transformed Response:

cat("RMSE:", train_rmse, "\n")

## RMSE: 61907.81

cat("R2:", train_r2, "\n")

## R2: 0.5007497

Creating the Predicted Sale Prices on the Test Set

# Predictions on the test set (on the log1p scale the model was fit on)
test_predictions <- predict(
  numeric_linear_model_top,
  newdata = test_data_cleaned %>% select(all_of(top_predictors))
)

# Back-transform the predictions to the original price scale. The response
# was transformed with log1p(), so the inverse is expm1(); exp() would
# overstate every prediction by exactly 1.
test_predictions <- expm1(test_predictions)

Model Creation on the Test Dataset

# Add the predicted SalePrice to test_data_cleaned
test_data_cleaned$PredictedSalePrice <- c(test_predictions)

# No observed SalePrice is available for the test rows (predicting it is the
# purpose of the submission), so randomly drawn train prices cannot serve as
# ground truth. Out-of-sample error is instead estimated with k-fold
# cross-validation on the training data: every row is predicted by a model
# that was fit without it.
n_folds <- 5
n_train <- nrow(numeric_train_set_top)
fold_id <- sample(rep(seq_len(n_folds), length.out = n_train))
cv_predictions <- numeric(n_train)

for (fold in seq_len(n_folds)) {
  held_out <- fold_id == fold
  # Same specification as the final model: log1p(SalePrice) on the top
  # predictors ("." expands to every column other than the response)
  fold_model <- lm(
    log1p(SalePrice) ~ .,
    data = numeric_train_set_top[!held_out, , drop = FALSE]
  )
  cv_predictions[held_out] <- expm1(predict(
    fold_model,
    newdata = numeric_train_set_top[held_out, , drop = FALSE]
  ))
}

# Calculate RMSE and R2 (squared Pearson correlation) on the held-out
# predictions, on the original price scale
test_rmse <- sqrt(mean((cv_predictions - numeric_train_set_top$SalePrice)^2))
test_r2 <- cor(cv_predictions, numeric_train_set_top$SalePrice)^2

# Print the values stored in test_rmse and test_r2
cat("Test Set Metrics:\n")

## Test Set Metrics:

cat("RMSE:", test_rmse, "\n")

## RMSE: 109494.3

cat("R2:", test_r2, "\n")

## R2: 0.0001409588

Kaggle Statistics

# Assemble the submission table: the test Ids next to their predicted prices
submission <- data.frame(
  Id = test_data_cleaned[["Id"]],
  SalePrice = test_predictions
)

# Save the submission as a CSV file without row names
write.csv(x = submission, file = "submission.csv", row.names = FALSE)

# Kaggle score, recorded by hand as a string
kaggle_score <- "0.19276"
cat("Kaggle Score:", kaggle_score, "\n")

## Kaggle Score: 0.19276

# Kaggle rank, recorded by hand as a string
kaggle_rank <- "3873"
cat("Kaggle Rank:", kaggle_rank, "\n")

## Kaggle Rank: 3873

House Prices Prediction Project

TX Harris

Kaggle Competition Site Exploration

Data Cleaning: Account for Missing Variables

Data Cleaning: Accounting for Changing Data Types as Needed

Model Development: Picking Predictors

Train the Train Data with Only the Top 5 Predictors

Model Performance on the Train Dataset

Creating the Predicted Sale Prices on the Test Set

Model Creation on the Test Dataset

Kaggle Statistics