INFO 659 Project

Marina Clements
Anthony Dellapia
Brian Dusape
Victor Enchautegui

11/28/2020

Load R Libaries

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.0.3

library(e1071)

## Warning: package 'e1071' was built under R version 4.0.3

library(caret)

## Warning: package 'caret' was built under R version 4.0.3

## Loading required package: lattice

library("rpart")
library("rpart.plot")

## Warning: package 'rpart.plot' was built under R version 4.0.3

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(plotly)

## Warning: package 'plotly' was built under R version 4.0.3

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(hrbrthemes)

## Warning: package 'hrbrthemes' was built under R version 4.0.3

## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.

##       Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and

##       if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow

Data preparation and Cleansing

Load data and initial data conversion/transformation:

Load “data.csv” into data frame variable in R using read.csv().

housing_data <- read.csv("data/data.csv")

Convert the following attributes into as nominal (categorical, factor) attributes: Recession.

housing_data$Recession <- factor(housing_data$Recession,levels=c(0,1), labels=c("No","Yes"))
colnames(housing_data)[1] <- "Dates"

Use class() function check on Recession, they should ALL be “factor” variables.

class(housing_data$Recession)

## [1] "factor"

Create a filtered dataset with only non-negative amounts.

Use the subset() function to select only positive values:

housing_data_po <- subset(housing_data,HousePrice_Min >=0 & HousePrice_Max >=0 & HousePrice_AVG >=0 &   ZHVI_BottomTier >=0 &   ZHVI_MiddleTier >=0 & ZHVI_TopTier >=0)

Plot and review data:

var1 <- housing_data_po$Dates
var2 <- housing_data_po$HousePrice_Min
var3 <- housing_data_po$HousePrice_Max
var4 <- housing_data_po$HousePrice_AVG
xform <- list(categoryorder = "array",
              categoryarray = housing_data_po$Dates)

p <-plot_ly(data = housing_data_po,
        x = ~var1,
        y = ~var2, name="Low-end Home Prices",
        type = "scatter",
        mode = "lines+markers") %>%
        layout(title = "my title",
               xaxis = xform)
p <- add_trace(p, x = ~var1, y = ~var3, type="scatter", mode="lines+markers", name="High-end Home Prices")
p <- add_trace(p, x = ~var1, y = ~var4, type="scatter", mode="lines+markers", name="Average Home Prices")


# add shapes to the layout
p <- layout(p, title = 'Housing Trends w/ Recession Periods (shaded areas)',
             shapes = list(
               list(type = "rect",
                    fillcolor = "rgb(30, 100, 120)", line = list(color = "grey"), opacity = 0.3,
                    x0 = "3/31/2001", x1 = "10/31/2001", xref = "x",
                    y0 = 0, y1 = 700000, yref = "y"),
               list(type = "rect",
                    fillcolor = "rgb(30, 100, 120)", line = list(color = "grey"), opacity = 0.3,
                    x0 = "12/31/2007", x1 = "5/31/2009", xref = "x",
                    y0 = 0, y1 = 700000, yref = "y")))
p

## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

var1 <- housing_data_po$Dates
var2 <- housing_data_po$ZHVI_BottomTier
var3 <- housing_data_po$ZHVI_MiddleTier
var4 <- housing_data_po$ZHVI_TopTier
xform <- list(categoryorder = "array",
              categoryarray = housing_data_po$Dates)

p <-plot_ly(data = housing_data_po,
        x = ~var1,
        y = ~var2, name="Bottom-Tier Home Prices",
        type = "scatter",
        mode = "lines+markers") %>%
        layout(title = "my title",
               xaxis = xform)
p <- add_trace(p, x = ~var1, y = ~var3, type="scatter", mode="lines+markers", name="Middle-Tier Home Prices")
p <- add_trace(p, x = ~var1, y = ~var4, type="scatter", mode="lines+markers", name="Top-Tier Home Prices")


# add shapes to the layout
p <- layout(p, title = 'Housing Trends w/ Recession Periods (shaded areas)',
             shapes = list(
               list(type = "rect",
                    fillcolor = "rgb(30, 100, 120)", line = list(color = "grey"), opacity = 0.3,
                    x0 = "3/31/2001", x1 = "10/31/2001", xref = "x",
                    y0 = 0, y1 = 400000, yref = "y"),
               list(type = "rect",
                    fillcolor = "rgb(30, 100, 120)", line = list(color = "grey"), opacity = 0.3,
                    x0 = "12/31/2007", x1 = "5/31/2009", xref = "x",
                    y0 = 0, y1 = 400000, yref = "y")))
p

var1 <- housing_data_po$Dates
var2 <- housing_data_po$UnemploymentRate
var3 <- housing_data_po$MortgagesDelinquencyPercent
var4 <- housing_data_po$FixedRateMortgageAverage_30Y
xform <- list(categoryorder = "array",
              categoryarray = housing_data_po$Dates)

p <-plot_ly(data = housing_data_po,
        x = ~Dates,
        y = ~UnemploymentRate, name="Unemployment Rate",
        type = "scatter",
        mode = "lines+markers") %>%
        layout(title = "my title",
               xaxis = xform)
p <- add_trace(p, x = ~var1, y = ~var3, type="scatter", mode="lines+markers", name="Mortgage Delinquency %")
p <- add_trace(p, x = ~var1, y = ~var4, type="scatter", mode="lines+markers", name="Fixed Rate Mortgage 30-Years")


# add shapes to the layout
p <- layout(p, title = 'Housing Trends w/ Recession Periods (shaded areas)',
             shapes = list(
               list(type = "rect",
                    fillcolor = "rgb(30, 100, 120)", line = list(color = "grey"), opacity = 0.3,
                    x0 = "3/31/2001", x1 = "10/31/2001", xref = "x",
                    y0 = 0, y1 = 12.5, yref = "y"),
               list(type = "rect",
                    fillcolor = "rgb(30, 100, 120)", line = list(color = "grey"), opacity = 0.3,
                    x0 = "12/31/2007", x1 = "5/31/2009", xref = "x",
                    y0 = 0, y1 = 12.5, yref = "y")))
p

Data Transformation

housing_data_po$wages_to_avgPrice <- c(housing_data_po$ZHVI_MiddleTier / housing_data_po$AverageWages)

var1 <- housing_data_po$Dates
var2 <- housing_data_po$UnemploymentRate
var3 <- housing_data_po$MortgagesDelinquencyPercent
var4 <- housing_data_po$wages_to_avgPrice
xform <- list(categoryorder = "array",
              categoryarray = housing_data_po$Dates)

p <-plot_ly(data = housing_data_po,
        x = ~Dates,
        y = ~UnemploymentRate, name="Unemployment Rate",
        type = "scatter",
        mode = "lines+markers") %>%
        layout(title = "my title",
               xaxis = xform)
p <- add_trace(p, x = ~var1, y = ~var3, type="scatter", mode="lines+markers", name="Mortgage Delinquency %")
p <- add_trace(p, x = ~var1, y = ~var4, type="scatter", mode="lines+markers", name="Wages-to-Avg Home Price Ratio")


# add shapes to the layout
p <- layout(p, title = 'Housing Trends w/ Recession Periods (shaded areas)',
             shapes = list(
               list(type = "rect",
                    fillcolor = "rgb(30, 100, 120)", line = list(color = "grey"), opacity = 0.3,
                    x0 = "3/31/2001", x1 = "10/31/2001", xref = "x",
                    y0 = 0, y1 = 12.5, yref = "y"),
               list(type = "rect",
                    fillcolor = "rgb(30, 100, 120)", line = list(color = "grey"), opacity = 0.3,
                    x0 = "12/31/2007", x1 = "5/31/2009", xref = "x",
                    y0 = 0, y1 = 12.5, yref = "y")))
p

Classification/Modeling

Model 1: Model with Recession and ‘HousePrice_Min’, ‘HousePrice_Max’, and ‘HousePrice_AVG’ variables, and evaluate:

• Select 90% of data for training and 10% for testing;

df <- sort(sample(nrow(housing_data_po), nrow(housing_data_po)*.9))
train <- housing_data_po[df, ]
test <- housing_data_po[-df, ]

• Build a model with training data (90% data) to predict Recession, using ‘HousePrice_Min’, ‘HousePrice_Max’, and ‘HousePrice_AVG’ variables.

nbDem <- naiveBayes(Recession ~ ZHVI_BottomTier + ZHVI_MiddleTier + ZHVI_TopTier, train)
nbDem

## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##        No       Yes 
## 0.8974359 0.1025641 
## 
## Conditional probabilities:
##      ZHVI_BottomTier
## Y         [,1]     [,2]
##   No  101336.0 20355.02
##   Yes 110794.2 17867.46
## 
##      ZHVI_MiddleTier
## Y         [,1]     [,2]
##   No  167362.9 37472.85
##   Yes 179042.3 30514.54
## 
##      ZHVI_TopTier
## Y         [,1]     [,2]
##   No  290545.8 68427.75
##   Yes 307934.7 52198.19

• Run prediction with the model on test data (10% data):

nb_prediction <- predict(nbDem, test, type = "class")

confusionMatrix(data = nb_prediction,
                reference =test$Recession, 
                dnn = c("Predicted", "Actual"),
                mode = "prec_recall")

## Confusion Matrix and Statistics
## 
##          Actual
## Predicted No Yes
##       No  25   2
##       Yes  0   0
##                                           
##                Accuracy : 0.9259          
##                  95% CI : (0.7571, 0.9909)
##     No Information Rate : 0.9259          
##     P-Value [Acc > NIR] : 0.6768          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : 0.4795          
##                                           
##               Precision : 0.9259          
##                  Recall : 1.0000          
##                      F1 : 0.9615          
##              Prevalence : 0.9259          
##          Detection Rate : 0.9259          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : No              
##

Model 2: Perform data transformation with new variables:

• Build a model with training data (90% data) to predict Recession, using the new variables.

df1 <- sort(sample(nrow(housing_data_po), nrow(housing_data_po)*.9))
train1 <- housing_data_po[df1, ]
test1 <- housing_data_po[-df1, ]

• Run prediction with the model on test data (10% data) and record the following scores:

nbDem1 <- naiveBayes(Recession ~ UnemploymentRate + MortgagesDelinquencyPercent + wages_to_avgPrice + ZHVI_MiddleTier, train1)
nbDem1

## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##         No        Yes 
## 0.90598291 0.09401709 
## 
## Conditional probabilities:
##      UnemploymentRate
## Y         [,1]     [,2]
##   No  5.826887 1.687333
##   Yes 6.136364 1.625554
## 
##      MortgagesDelinquencyPercent
## Y         [,1]     [,2]
##   No  4.481981 3.374208
##   Yes 5.049545 2.262831
## 
##      wages_to_avgPrice
## Y         [,1]      [,2]
##   No  4.676131 0.6170001
##   Yes 4.991823 0.4149206
## 
##      ZHVI_MiddleTier
## Y         [,1]     [,2]
##   No  166953.7 36577.91
##   Yes 179787.4 29878.03

nb_prediction1 <- predict(nbDem1, test1, type = "class")

confusionMatrix(data = nb_prediction1,
                reference =test1$Recession, 
                dnn = c("Predicted", "Actual"),
                mode = "prec_recall")

## Confusion Matrix and Statistics
## 
##          Actual
## Predicted No Yes
##       No  23   4
##       Yes  0   0
##                                           
##                Accuracy : 0.8519          
##                  95% CI : (0.6627, 0.9581)
##     No Information Rate : 0.8519          
##     P-Value [Acc > NIR] : 0.6293          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : 0.1336          
##                                           
##               Precision : 0.8519          
##                  Recall : 1.0000          
##                      F1 : 0.9200          
##              Prevalence : 0.8519          
##          Detection Rate : 0.8519          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : No              
##

Model 3 and 4 Log Variables: Examine attribute value distribution (histogram), and perform log transformation on attributes that are skewed:

• Create a new attribute that is the logarithm of each attribute with an extremely wide, “skew” distribution.

hist(housing_data_po$ZHVI_MiddleTier, xlab="Housing Middle-Tier Price")

hist(housing_data_po$ZHVI_BottomTier, xlab="Housing Bottom-Tier Price")

hist(housing_data_po$ZHVI_TopTier, xlab="Housing Top-Tier Price")

hist(housing_data_po$wages_to_avgPrice, xlab="'Wage-to-Avg Home Price' Ratio")

hist(housing_data_po$UnemploymentRate, xlab="Unemployment Rates")

hist(housing_data_po$MortgagesDelinquencyPercent, xlab="Mortgages Delinquency %")

• The following attributes were identified and logged due to wide skew distribution:

housing_data_po$ZHVI_BottomTier_LOG <- log10(housing_data_po$HousePrice_Min)
housing_data_po$ZHVI_TopTier_LOG <-log10(housing_data_po$HousePrice_Max) 
housing_data_po$ZHVI_MiddleTier_LOG <-log10(housing_data_po$HousePrice_AVG)
housing_data_po$wages_to_avgPrice_LOG <-log10(housing_data_po$wages_to_avgPrice)
housing_data_po$UnemploymentRate_LOG <-log10(housing_data_po$UnemploymentRate)
housing_data_po$MortgagesDelinquencyPercent_LOG <-log10(housing_data_po$MortgagesDelinquencyPercent)

hist(housing_data_po$ZHVI_BottomTier_LOG, xlab="Housing Bottom-Tier Price (LOG)")

hist(housing_data_po$ZHVI_TopTier_LOG, xlab="Housing Top-Tier Price (LOG)")

hist(housing_data_po$ZHVI_MiddleTier_LOG, xlab="Housing Middle-Tier Price (LOG)")

hist(housing_data_po$wages_to_avgPrice_LOG, xlab="'Wage-to-Avg Home Price' Ratio (LOG)")

hist(housing_data_po$UnemploymentRate_LOG, xlab="Unemployment Rate (LOG)")

hist(housing_data_po$MortgagesDelinquencyPercent_LOG, xlab="Mortgages Delinquency % (LOG)")

Model 3: Build a model with training data (90% data) to predict Recession using the log-transformed attributes:

df_log <- sort(sample(nrow(housing_data_po), nrow(housing_data_po)*.9))
train_log <- housing_data_po[df_log, ]
test_log <- housing_data_po[-df_log, ]
nbDem_log <- naiveBayes(Recession ~ ZHVI_BottomTier_LOG + ZHVI_MiddleTier_LOG + ZHVI_TopTier_LOG, train_log)
nbDem_log

## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##         No        Yes 
## 0.91025641 0.08974359 
## 
## Conditional probabilities:
##      ZHVI_BottomTier_LOG
## Y         [,1]       [,2]
##   No  4.819488 0.14177230
##   Yes 4.800744 0.08545143
## 
##      ZHVI_MiddleTier_LOG
## Y         [,1]       [,2]
##   No  5.207438 0.10838285
##   Yes 5.241090 0.08481566
## 
##      ZHVI_TopTier_LOG
## Y         [,1]      [,2]
##   No  5.573411 0.1662201
##   Yes 5.597482 0.1495092

nb_prediction_log <- predict(nbDem_log, test_log, type = "class")

confusionMatrix(data = nb_prediction_log,
                reference =test_log$Recession, 
                dnn = c("Predicted", "Actual"),
                mode = "prec_recall")

## Confusion Matrix and Statistics
## 
##          Actual
## Predicted No Yes
##       No  22   5
##       Yes  0   0
##                                          
##                Accuracy : 0.8148         
##                  95% CI : (0.6192, 0.937)
##     No Information Rate : 0.8148         
##     P-Value [Acc > NIR] : 0.61657        
##                                          
##                   Kappa : 0              
##                                          
##  Mcnemar's Test P-Value : 0.07364        
##                                          
##               Precision : 0.8148         
##                  Recall : 1.0000         
##                      F1 : 0.8980         
##              Prevalence : 0.8148         
##          Detection Rate : 0.8148         
##    Detection Prevalence : 1.0000         
##       Balanced Accuracy : 0.5000         
##                                          
##        'Positive' Class : No             
##

Model 4: Build a model with training data (90% data) to predict Recession using different log-transformed attributes:

df_log1 <- sort(sample(nrow(housing_data_po), nrow(housing_data_po)*.9))
train_log1 <- housing_data_po[df_log1, ]
test_log1 <- housing_data_po[-df_log1, ]
nbDem_log1 <- naiveBayes(Recession ~ UnemploymentRate_LOG + MortgagesDelinquencyPercent_LOG + wages_to_avgPrice_LOG + ZHVI_MiddleTier_LOG, train_log1)
nbDem_log1

## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##        No       Yes 
## 0.8931624 0.1068376 
## 
## Conditional probabilities:
##      UnemploymentRate_LOG
## Y          [,1]     [,2]
##   No  0.7556018 0.114998
##   Yes 0.7635107 0.106800
## 
##      MortgagesDelinquencyPercent_LOG
## Y         [,1]      [,2]
##   No  0.553676 0.3119041
##   Yes 0.621927 0.2191180
## 
##      wages_to_avgPrice_LOG
## Y          [,1]       [,2]
##   No  0.6656293 0.05371096
##   Yes 0.6958349 0.03970397
## 
##      ZHVI_MiddleTier_LOG
## Y         [,1]       [,2]
##   No  5.213731 0.10394877
##   Yes 5.242191 0.08290356

nb_prediction_log1 <- predict(nbDem_log1, test_log1, type = "class")

confusionMatrix(data = nb_prediction_log1,
                reference =test_log1$Recession, 
                dnn = c("Predicted", "Actual"),
                mode = "prec_recall")

## Confusion Matrix and Statistics
## 
##          Actual
## Predicted No Yes
##       No  26   1
##       Yes  0   0
##                                           
##                Accuracy : 0.963           
##                  95% CI : (0.8103, 0.9991)
##     No Information Rate : 0.963           
##     P-Value [Acc > NIR] : 0.7358          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : 1.0000          
##                                           
##               Precision : 0.9630          
##                  Recall : 1.0000          
##                      F1 : 0.9811          
##              Prevalence : 0.9630          
##          Detection Rate : 0.9630          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : No              
##

Evaluation and Results

Model	Variables	Model 1 & 2 are Classification without Transformation
Model	Variables	Correct %	Precision	Recall	F	Kappa
1	HousePrice_Min, HousePrice_Max, HousePrice_AVG	88.89%	.8889	1.0	.9412	0.0
2	HousePrice_AVG, FixedRateMortgageAverage_30Y, UnemploymentRate	92.59%	.9259	1.0	.9615	0.0
3	HousePrice_Min_LOG, HousePrice_Max_LOG, HousePrice_AVG_LOG	88.89%	.8889	1.0	.9412	0.0
4	HousePrice_AVG_LOG, FixedRateMortgageAverage_30Y_LOG, UnemploymentRate_LOG	85.19%	.8519	1.0	.92	0.0