library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.3
library(e1071)
## Warning: package 'e1071' was built under R version 4.0.3
library(caret)
## Warning: package 'caret' was built under R version 4.0.3
## Loading required package: lattice
library("rpart")
library("rpart.plot")
## Warning: package 'rpart.plot' was built under R version 4.0.3
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(plotly)
## Warning: package 'plotly' was built under R version 4.0.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(hrbrthemes)
## Warning: package 'hrbrthemes' was built under R version 4.0.3
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
## Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
## if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
housing_data <- read.csv("data/data.csv")
housing_data$Recession <- factor(housing_data$Recession,levels=c(0,1), labels=c("No","Yes"))
colnames(housing_data)[1] <- "Dates"
class(housing_data$Recession)
## [1] "factor"
housing_data_po <- subset(housing_data,HousePrice_Min >=0 & HousePrice_Max >=0 & HousePrice_AVG >=0 & ZHVI_BottomTier >=0 & ZHVI_MiddleTier >=0 & ZHVI_TopTier >=0)
var1 <- housing_data_po$Dates
var2 <- housing_data_po$HousePrice_Min
var3 <- housing_data_po$HousePrice_Max
var4 <- housing_data_po$HousePrice_AVG
xform <- list(categoryorder = "array",
categoryarray = housing_data_po$Dates)
p <-plot_ly(data = housing_data_po,
x = ~var1,
y = ~var2, name="Low-end Home Prices",
type = "scatter",
mode = "lines+markers") %>%
layout(title = "my title",
xaxis = xform)
p <- add_trace(p, x = ~var1, y = ~var3, type="scatter", mode="lines+markers", name="High-end Home Prices")
p <- add_trace(p, x = ~var1, y = ~var4, type="scatter", mode="lines+markers", name="Average Home Prices")
# add shapes to the layout
p <- layout(p, title = 'Housing Trends w/ Recession Periods (shaded areas)',
shapes = list(
list(type = "rect",
fillcolor = "rgb(30, 100, 120)", line = list(color = "grey"), opacity = 0.3,
x0 = "3/31/2001", x1 = "10/31/2001", xref = "x",
y0 = 0, y1 = 700000, yref = "y"),
list(type = "rect",
fillcolor = "rgb(30, 100, 120)", line = list(color = "grey"), opacity = 0.3,
x0 = "12/31/2007", x1 = "5/31/2009", xref = "x",
y0 = 0, y1 = 700000, yref = "y")))
p
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
var1 <- housing_data_po$Dates
var2 <- housing_data_po$ZHVI_BottomTier
var3 <- housing_data_po$ZHVI_MiddleTier
var4 <- housing_data_po$ZHVI_TopTier
xform <- list(categoryorder = "array",
categoryarray = housing_data_po$Dates)
p <-plot_ly(data = housing_data_po,
x = ~var1,
y = ~var2, name="Bottom-Tier Home Prices",
type = "scatter",
mode = "lines+markers") %>%
layout(title = "my title",
xaxis = xform)
p <- add_trace(p, x = ~var1, y = ~var3, type="scatter", mode="lines+markers", name="Middle-Tier Home Prices")
p <- add_trace(p, x = ~var1, y = ~var4, type="scatter", mode="lines+markers", name="Top-Tier Home Prices")
# add shapes to the layout
p <- layout(p, title = 'Housing Trends w/ Recession Periods (shaded areas)',
shapes = list(
list(type = "rect",
fillcolor = "rgb(30, 100, 120)", line = list(color = "grey"), opacity = 0.3,
x0 = "3/31/2001", x1 = "10/31/2001", xref = "x",
y0 = 0, y1 = 400000, yref = "y"),
list(type = "rect",
fillcolor = "rgb(30, 100, 120)", line = list(color = "grey"), opacity = 0.3,
x0 = "12/31/2007", x1 = "5/31/2009", xref = "x",
y0 = 0, y1 = 400000, yref = "y")))
p
var1 <- housing_data_po$Dates
var2 <- housing_data_po$UnemploymentRate
var3 <- housing_data_po$MortgagesDelinquencyPercent
var4 <- housing_data_po$FixedRateMortgageAverage_30Y
xform <- list(categoryorder = "array",
categoryarray = housing_data_po$Dates)
p <-plot_ly(data = housing_data_po,
x = ~Dates,
y = ~UnemploymentRate, name="Unemployment Rate",
type = "scatter",
mode = "lines+markers") %>%
layout(title = "my title",
xaxis = xform)
p <- add_trace(p, x = ~var1, y = ~var3, type="scatter", mode="lines+markers", name="Mortgage Delinquency %")
p <- add_trace(p, x = ~var1, y = ~var4, type="scatter", mode="lines+markers", name="Fixed Rate Mortgage 30-Years")
# add shapes to the layout
p <- layout(p, title = 'Housing Trends w/ Recession Periods (shaded areas)',
shapes = list(
list(type = "rect",
fillcolor = "rgb(30, 100, 120)", line = list(color = "grey"), opacity = 0.3,
x0 = "3/31/2001", x1 = "10/31/2001", xref = "x",
y0 = 0, y1 = 12.5, yref = "y"),
list(type = "rect",
fillcolor = "rgb(30, 100, 120)", line = list(color = "grey"), opacity = 0.3,
x0 = "12/31/2007", x1 = "5/31/2009", xref = "x",
y0 = 0, y1 = 12.5, yref = "y")))
p
housing_data_po$wages_to_avgPrice <- c(housing_data_po$ZHVI_MiddleTier / housing_data_po$AverageWages)
var1 <- housing_data_po$Dates
var2 <- housing_data_po$UnemploymentRate
var3 <- housing_data_po$MortgagesDelinquencyPercent
var4 <- housing_data_po$wages_to_avgPrice
xform <- list(categoryorder = "array",
categoryarray = housing_data_po$Dates)
p <-plot_ly(data = housing_data_po,
x = ~Dates,
y = ~UnemploymentRate, name="Unemployment Rate",
type = "scatter",
mode = "lines+markers") %>%
layout(title = "my title",
xaxis = xform)
p <- add_trace(p, x = ~var1, y = ~var3, type="scatter", mode="lines+markers", name="Mortgage Delinquency %")
p <- add_trace(p, x = ~var1, y = ~var4, type="scatter", mode="lines+markers", name="Wages-to-Avg Home Price Ratio")
# add shapes to the layout
p <- layout(p, title = 'Housing Trends w/ Recession Periods (shaded areas)',
shapes = list(
list(type = "rect",
fillcolor = "rgb(30, 100, 120)", line = list(color = "grey"), opacity = 0.3,
x0 = "3/31/2001", x1 = "10/31/2001", xref = "x",
y0 = 0, y1 = 12.5, yref = "y"),
list(type = "rect",
fillcolor = "rgb(30, 100, 120)", line = list(color = "grey"), opacity = 0.3,
x0 = "12/31/2007", x1 = "5/31/2009", xref = "x",
y0 = 0, y1 = 12.5, yref = "y")))
p
• Select 90% of data for training and 10% for testing;
df <- sort(sample(nrow(housing_data_po), nrow(housing_data_po)*.9))
train <- housing_data_po[df, ]
test <- housing_data_po[-df, ]
• Build a model with training data (90% data) to predict Recession, using ‘HousePrice_Min’, ‘HousePrice_Max’, and ‘HousePrice_AVG’ variables.
nbDem <- naiveBayes(Recession ~ ZHVI_BottomTier + ZHVI_MiddleTier + ZHVI_TopTier, train)
nbDem
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## No Yes
## 0.8974359 0.1025641
##
## Conditional probabilities:
## ZHVI_BottomTier
## Y [,1] [,2]
## No 101336.0 20355.02
## Yes 110794.2 17867.46
##
## ZHVI_MiddleTier
## Y [,1] [,2]
## No 167362.9 37472.85
## Yes 179042.3 30514.54
##
## ZHVI_TopTier
## Y [,1] [,2]
## No 290545.8 68427.75
## Yes 307934.7 52198.19
• Run prediction with the model on test data (10% data):
nb_prediction <- predict(nbDem, test, type = "class")
confusionMatrix(data = nb_prediction,
reference =test$Recession,
dnn = c("Predicted", "Actual"),
mode = "prec_recall")
## Confusion Matrix and Statistics
##
## Actual
## Predicted No Yes
## No 25 2
## Yes 0 0
##
## Accuracy : 0.9259
## 95% CI : (0.7571, 0.9909)
## No Information Rate : 0.9259
## P-Value [Acc > NIR] : 0.6768
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 0.4795
##
## Precision : 0.9259
## Recall : 1.0000
## F1 : 0.9615
## Prevalence : 0.9259
## Detection Rate : 0.9259
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : No
##
• Build a model with training data (90% data) to predict Recession, using the new variables.
df1 <- sort(sample(nrow(housing_data_po), nrow(housing_data_po)*.9))
train1 <- housing_data_po[df1, ]
test1 <- housing_data_po[-df1, ]
• Run prediction with the model on test data (10% data) and record the following scores:
nbDem1 <- naiveBayes(Recession ~ UnemploymentRate + MortgagesDelinquencyPercent + wages_to_avgPrice + ZHVI_MiddleTier, train1)
nbDem1
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## No Yes
## 0.90598291 0.09401709
##
## Conditional probabilities:
## UnemploymentRate
## Y [,1] [,2]
## No 5.826887 1.687333
## Yes 6.136364 1.625554
##
## MortgagesDelinquencyPercent
## Y [,1] [,2]
## No 4.481981 3.374208
## Yes 5.049545 2.262831
##
## wages_to_avgPrice
## Y [,1] [,2]
## No 4.676131 0.6170001
## Yes 4.991823 0.4149206
##
## ZHVI_MiddleTier
## Y [,1] [,2]
## No 166953.7 36577.91
## Yes 179787.4 29878.03
nb_prediction1 <- predict(nbDem1, test1, type = "class")
confusionMatrix(data = nb_prediction1,
reference =test1$Recession,
dnn = c("Predicted", "Actual"),
mode = "prec_recall")
## Confusion Matrix and Statistics
##
## Actual
## Predicted No Yes
## No 23 4
## Yes 0 0
##
## Accuracy : 0.8519
## 95% CI : (0.6627, 0.9581)
## No Information Rate : 0.8519
## P-Value [Acc > NIR] : 0.6293
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 0.1336
##
## Precision : 0.8519
## Recall : 1.0000
## F1 : 0.9200
## Prevalence : 0.8519
## Detection Rate : 0.8519
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : No
##
• Create a new attribute that is the logarithm of each attribute with an extremely wide, “skew” distribution.
hist(housing_data_po$ZHVI_MiddleTier, xlab="Housing Middle-Tier Price")
hist(housing_data_po$ZHVI_BottomTier, xlab="Housing Bottom-Tier Price")
hist(housing_data_po$ZHVI_TopTier, xlab="Housing Top-Tier Price")
hist(housing_data_po$wages_to_avgPrice, xlab="'Wage-to-Avg Home Price' Ratio")
hist(housing_data_po$UnemploymentRate, xlab="Unemployment Rates")
hist(housing_data_po$MortgagesDelinquencyPercent, xlab="Mortgages Delinquency %")
• The following attributes were identified and logged due to wide skew distribution:
housing_data_po$ZHVI_BottomTier_LOG <- log10(housing_data_po$HousePrice_Min)
housing_data_po$ZHVI_TopTier_LOG <-log10(housing_data_po$HousePrice_Max)
housing_data_po$ZHVI_MiddleTier_LOG <-log10(housing_data_po$HousePrice_AVG)
housing_data_po$wages_to_avgPrice_LOG <-log10(housing_data_po$wages_to_avgPrice)
housing_data_po$UnemploymentRate_LOG <-log10(housing_data_po$UnemploymentRate)
housing_data_po$MortgagesDelinquencyPercent_LOG <-log10(housing_data_po$MortgagesDelinquencyPercent)
hist(housing_data_po$ZHVI_BottomTier_LOG, xlab="Housing Bottom-Tier Price (LOG)")
hist(housing_data_po$ZHVI_TopTier_LOG, xlab="Housing Top-Tier Price (LOG)")
hist(housing_data_po$ZHVI_MiddleTier_LOG, xlab="Housing Middle-Tier Price (LOG)")
hist(housing_data_po$wages_to_avgPrice_LOG, xlab="'Wage-to-Avg Home Price' Ratio (LOG)")
hist(housing_data_po$UnemploymentRate_LOG, xlab="Unemployment Rate (LOG)")
hist(housing_data_po$MortgagesDelinquencyPercent_LOG, xlab="Mortgages Delinquency % (LOG)")
df_log <- sort(sample(nrow(housing_data_po), nrow(housing_data_po)*.9))
train_log <- housing_data_po[df_log, ]
test_log <- housing_data_po[-df_log, ]
nbDem_log <- naiveBayes(Recession ~ ZHVI_BottomTier_LOG + ZHVI_MiddleTier_LOG + ZHVI_TopTier_LOG, train_log)
nbDem_log
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## No Yes
## 0.91025641 0.08974359
##
## Conditional probabilities:
## ZHVI_BottomTier_LOG
## Y [,1] [,2]
## No 4.819488 0.14177230
## Yes 4.800744 0.08545143
##
## ZHVI_MiddleTier_LOG
## Y [,1] [,2]
## No 5.207438 0.10838285
## Yes 5.241090 0.08481566
##
## ZHVI_TopTier_LOG
## Y [,1] [,2]
## No 5.573411 0.1662201
## Yes 5.597482 0.1495092
nb_prediction_log <- predict(nbDem_log, test_log, type = "class")
confusionMatrix(data = nb_prediction_log,
reference =test_log$Recession,
dnn = c("Predicted", "Actual"),
mode = "prec_recall")
## Confusion Matrix and Statistics
##
## Actual
## Predicted No Yes
## No 22 5
## Yes 0 0
##
## Accuracy : 0.8148
## 95% CI : (0.6192, 0.937)
## No Information Rate : 0.8148
## P-Value [Acc > NIR] : 0.61657
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 0.07364
##
## Precision : 0.8148
## Recall : 1.0000
## F1 : 0.8980
## Prevalence : 0.8148
## Detection Rate : 0.8148
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : No
##
df_log1 <- sort(sample(nrow(housing_data_po), nrow(housing_data_po)*.9))
train_log1 <- housing_data_po[df_log1, ]
test_log1 <- housing_data_po[-df_log1, ]
nbDem_log1 <- naiveBayes(Recession ~ UnemploymentRate_LOG + MortgagesDelinquencyPercent_LOG + wages_to_avgPrice_LOG + ZHVI_MiddleTier_LOG, train_log1)
nbDem_log1
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## No Yes
## 0.8931624 0.1068376
##
## Conditional probabilities:
## UnemploymentRate_LOG
## Y [,1] [,2]
## No 0.7556018 0.114998
## Yes 0.7635107 0.106800
##
## MortgagesDelinquencyPercent_LOG
## Y [,1] [,2]
## No 0.553676 0.3119041
## Yes 0.621927 0.2191180
##
## wages_to_avgPrice_LOG
## Y [,1] [,2]
## No 0.6656293 0.05371096
## Yes 0.6958349 0.03970397
##
## ZHVI_MiddleTier_LOG
## Y [,1] [,2]
## No 5.213731 0.10394877
## Yes 5.242191 0.08290356
nb_prediction_log1 <- predict(nbDem_log1, test_log1, type = "class")
confusionMatrix(data = nb_prediction_log1,
reference =test_log1$Recession,
dnn = c("Predicted", "Actual"),
mode = "prec_recall")
## Confusion Matrix and Statistics
##
## Actual
## Predicted No Yes
## No 26 1
## Yes 0 0
##
## Accuracy : 0.963
## 95% CI : (0.8103, 0.9991)
## No Information Rate : 0.963
## P-Value [Acc > NIR] : 0.7358
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 1.0000
##
## Precision : 0.9630
## Recall : 1.0000
## F1 : 0.9811
## Prevalence : 0.9630
## Detection Rate : 0.9630
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : No
##
| Model | Variables | Model 1 & 2 are Classification without Transformation | ||||
|---|---|---|---|---|---|---|
| Correct % | Precision | Recall | F | Kappa | ||
| 1 | HousePrice_Min, HousePrice_Max, HousePrice_AVG | 88.89% | .8889 | 1.0 | .9412 | 0.0 |
| 2 |
HousePrice_AVG, FixedRateMortgageAverage_30Y, UnemploymentRate |
92.59% | .9259 | 1.0 | .9615 | 0.0 |
| 3 |
HousePrice_Min_LOG, HousePrice_Max_LOG, HousePrice_AVG_LOG |
88.89% | .8889 | 1.0 | .9412 | 0.0 |
| 4 |
HousePrice_AVG_LOG, FixedRateMortgageAverage_30Y_LOG, UnemploymentRate_LOG |
85.19% | .8519 | 1.0 | .92 | 0.0 |