options(scipen = 100, 
        dplyr.summarise.inform = FALSE)

set.seed(101)

library(dplyr)
library(ggplot2)
library(tidyr)
library(GGally)
library(ggridges)
library(car)
library(MASS)
library(dummies)

Problem 1

Samples

Using R, generate a random variable X that has 10,000 random uniform numbers from 1 to N, where N can be any number of your choosing greater than or equal to 6. Then generate a random variable Y that has 10,000 random normal numbers with a mean and standard deviation of \(\mu = \sigma = (N+1)/2\).

n <- 605

# create a vector for all integers between 1 and N
nlist <- seq(1,n,1)

X <- sample(nlist, 10000, replace = T)

# mu = sigma = (n+1)/2
ms <- (n + 1) / 2

Y <- rnorm(10000, mean = ms, sd = ms)

 

Probability

Probability. Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the median of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities.

x <- median(X)
x
## [1] 303
summary(Y)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -840.0   100.2   296.5   304.6   512.5  1657.7
y <- as.numeric(quantile(Y, 0.25))
y
## [1] 100.1909
df <- data.frame(X,Y)
  a. \(P(X>x | X>y)\)

What is the conditional probability of X > x given X > y?

df$a <- ifelse(df$X > x & df$X > y, 1, 0)
df$a_condition <- ifelse(df$X > y, 1, 0)
sum(df$a) / sum(df$a_condition)
## [1] 0.5983203
  b. \(P(X>x, Y>x)\)

What is the probability of X > x and Y > x?

df$b <- ifelse(df$X > x & df$Y > x, 1, 0)
sum(df$b) / nrow(df)
## [1] 0.2456
  c. \(P(X<x | X>y)\)

What is the conditional probability of X < x given X > y?

df$c <- ifelse(df$X < x & df$X > y, 1, 0)
df$c_condition <- ifelse(df$X > y, 1, 0)

sum(df$c) / sum(df$c_condition)
## [1] 0.3992801
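As a quick consistency check (an addition, not part of the original write-up), parts (a) and (c) partition the event X > y apart from ties where X equals the median exactly, so the three conditional probabilities should sum to exactly 1.

# (a) and (c) are complements given X > y, except for ties at X == x
pa <- sum(df$a) / sum(df$a_condition)
pc <- sum(df$c) / sum(df$c_condition)
ptie <- sum(df$X == x & df$X > y) / sum(df$a_condition)
pa + pc + ptie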

 

Contingency Table

Investigate whether \(P(X>x \text{ and } Y>y) = P(X>x)P(Y>y)\) by building a table and evaluating the marginal and joint probabilities.

df$y_status <- ifelse(df$Y > y, 'Y > y', 'Y <= y')

# X > x
dfx <- filter(df, X > x) %>%
  group_by(y_status) %>%
  summarize('X > x' = n())

# X < x
dfx2 <- filter(df, X <= x) %>%
  group_by(y_status) %>%
  summarize('X <= x' = n())

# all X conditions
df_all <- group_by(df, y_status) %>%
  summarize('total' = n())

# Combine
df2 <- inner_join(dfx, dfx2, by = c('y_status' = 'y_status')) %>%
  inner_join(df_all, by = c('y_status' = 'y_status'))

df2
## # A tibble: 2 x 4
##   y_status `X > x` `X <= x` total
##   <chr>      <int>    <int> <int>
## 1 Y <= y      1244     1256  2500
## 2 Y > y       3743     3757  7500
row_totals <- data.frame('y_status'='total', 
                         'X > x' = nrow(filter(df, X > x)), 
                         'X =< x' = nrow(filter(df, X <= x)),
                         'total' = nrow(df))

colnames(row_totals)[2] <- 'X > x'
colnames(row_totals)[3] <- 'X <= x'

df3 <- bind_rows(df2, row_totals)

df3
## # A tibble: 3 x 4
##   y_status `X > x` `X <= x` total
##   <chr>      <int>    <int> <int>
## 1 Y <= y      1244     1256  2500
## 2 Y > y       3743     3757  7500
## 3 total       4987     5013 10000
df4 <- mutate(df3, 
              `X > x` = `X > x` / nrow(df),
              `X <= x` = `X <= x` / nrow(df),
              total = total / nrow(df)) 

df4
## # A tibble: 3 x 4
##   y_status `X > x` `X <= x` total
##   <chr>      <dbl>    <dbl> <dbl>
## 1 Y <= y     0.124    0.126  0.25
## 2 Y > y      0.374    0.376  0.75
## 3 total      0.499    0.501  1

 

\(P(X>x \text{ and } Y>y) = P(X>x)P(Y>y)\)

left <- filter(df4, y_status == 'Y > y')$`X > x`
left
## [1] 0.3743
right <- filter(df4, y_status == 'total')$`X > x` * filter(df4, y_status == 'Y > y')$total
right
## [1] 0.374025
left == right
## [1] FALSE
right - left
## [1] -0.000275

The joint probability \(P(X>x \text{ and } Y>y)\) and the product of the marginals \(P(X>x)P(Y>y)\) are nearly equal; the small difference is attributable to sampling noise, which is consistent with independence.
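Since both sides are estimates from the same 10,000 draws, testing for exact equality is too strict; a tolerance-based comparison (a small added check, not in the original) makes the point more directly.

# allow for sampling noise rather than requiring exact equality
isTRUE(all.equal(left, right, tolerance = 0.01))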

 

Significance Tests

Check to see if the independence holds by using Fisher’s Exact Test and the Chi Square Test. What is the difference between the two? Which is most appropriate?

# visual/plot check
dplyr::select(df, X, Y) %>%
  gather(var, val) %>%
  ggplot(aes(x = val, fill = var)) +
  geom_density(alpha = .5) +
  geom_vline(xintercept = c(x), linetype=2, color = 'sienna') +
  geom_vline(xintercept = c(y), linetype=2, color = 'forestgreen') +
  theme_classic() +
  labs(x = element_blank(), y = element_blank()) +
  theme(legend.title = element_blank())

# get counts
df_test <- dplyr::select(df2, -y_status, -total)
df_test
## # A tibble: 2 x 2
##   `X > x` `X <= x`
##     <int>    <int>
## 1    1244     1256
## 2    3743     3757
# as matrix
dfm <- data.matrix(df_test)

fisher.test(df_test)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  df_test
## p-value = 0.9081
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.9071551 1.0894989
## sample estimates:
## odds ratio 
##  0.9941506
chisq.test(df_test)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  df_test
## X-squared = 0.0108, df = 1, p-value = 0.9172

If we check the plot, looking to the right of both cutoff lines, there's considerable overlap between the two densities. That overlap only tells us the marginal distributions of X and Y cover a similar range, though; on its own it can't establish whether the variables are dependent, which is why we turn to the formal tests.

Looking at both Fisher’s Exact Test and Pearson’s Chi-squared test, neither has a p-value low enough to reject the null hypothesis of independence, so the data are consistent with X and Y being independent.

Fisher’s exact test is an exact significance test of independence, typically used for small contingency tables where the chi-square approximation may be unreliable.

The chi-square test is a significance test based on the distribution of the squared differences between observed and expected counts; it relies on a large-sample approximation.

Since our contingency table is 2x2, Fisher’s exact test can be applied directly; with counts this large, however, the chi-square approximation is equally reliable, and the two tests agree.
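One way to judge whether the approximation is safe (a supplementary check, not part of the original analysis) is to inspect the expected cell counts under independence; a common rule of thumb is that they should all be at least 5, and here they are in the thousands.

# expected counts under independence, returned by the chi-square test object
chisq.test(dfm)$expected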

 

Problem 2

Loading Data

You are to register for Kaggle.com (free) and compete in the House Prices: Advanced Regression Techniques competition. https://www.kaggle.com/c/house-prices-advanced-regression-techniques. I want you to do the following.

train <- read.csv('https://github.com/dataconsumer101/data605/raw/main/train.csv', stringsAsFactors = F)
test <- read.csv('https://github.com/dataconsumer101/data605/raw/main/test.csv', stringsAsFactors = F)

Descriptive Statistics

5 points. Descriptive and Inferential Statistics. Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot matrix for at least two of the independent variables and the dependent variable. Derive a correlation matrix for any three quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?

 

Neighborhoods

One of the first things people do when buying a house is narrow down the location. Let’s take a look at the neighborhoods.

ggplot(train, aes(x = SalePrice/1000, y = reorder(Neighborhood, SalePrice))) +
  geom_density_ridges() +
  theme_bw() +
  labs(x = 'Sale Price (Thousands)',
       y = 'Neighborhood')
## Picking joint bandwidth of 17.2

ggplot(train, aes(x = SalePrice/1000, y = reorder(Neighborhood, SalePrice))) +
  geom_boxplot() +
  theme_bw() +
  labs(x = 'Sale Price (Thousands)',
       y = 'Neighborhood')

First, looking at neighborhoods with narrower distributions or smaller boxes: the houses in those neighborhoods are likely fairly homogeneous. Neighborhoods like SawyerW and those at the top of the two charts above have wider distributions, which suggests they contain all types of properties. This could mean small and large homes, apartments, or even commercial properties.

Let’s take a look at what types of properties are zoned in each neighborhood.

neighborhood_rank <- train %>%
  group_by(Neighborhood) %>%
  summarize(avg = mean(SalePrice))

train %>%
  group_by(MSZoning, Neighborhood) %>%
  summarize(count = n()) %>%
  inner_join(neighborhood_rank, by = c('Neighborhood' = 'Neighborhood')) %>%
  ggplot(aes(x = reorder(Neighborhood, avg), y = count, fill = MSZoning)) +
  geom_col() +
  labs(x = 'Neighborhood', y = element_blank()) +
  coord_flip()

Starting from the top through the middle, we mostly see blue, or RL – residential low density, which generally refers to detached single-family houses on their own lots.

In Somerst, there’s some FV – or Floating Village Residential. This sounds like a closed community and seems to make up the majority of the houses sold in that neighborhood.

SawyerW, the neighborhood in the middle with the wide spread, is mostly residential low density with a bit of RH, or residential high density – typically apartments.

As we get to the lower end of the chart, the neighborhoods with the lowest prices, we see more purple, or RM – residential medium density, and some red – commercial zoning – in IDOTRR.

Another obvious factor in a property’s price is its condition. Let’s take a look at the overall condition ratings (OverallCond) in each neighborhood.

inner_join(train, neighborhood_rank, by = c('Neighborhood' = 'Neighborhood')) %>%
  ggplot(aes(x = OverallCond)) +
  geom_histogram(bins=10) +
  facet_wrap(~reorder(Neighborhood, -avg), scales = 'free_y') +
  theme_bw() +
  labs(x = element_blank(), y = element_blank())

Wow, I’m surprised. The panels are ordered by average sale price, starting at the top left and reading across each row. The four most expensive neighborhoods have mostly average condition ratings, and some of the lowest-priced neighborhoods have a good share of higher-rated properties. Considering the distribution of prices was pretty wide for the most expensive neighborhoods, this could mean that these condition ratings aren’t a strong indicator of price.

 

Lot Size

As you arrive at a house, the first thing you notice is how big the house is and how much land it covers. Let’s see if there’s a relationship between the lot size and the sale price.

ggplot(train, aes(x = LotArea/1000, y = SalePrice/1000)) +
  geom_point(alpha = .1) +
  geom_smooth(method='lm') +
  labs(x = 'Lot Area in Thousands',
       y = 'Sale Price in Thousands') +
  theme_bw()
## `geom_smooth()` using formula 'y ~ x'

It looks like there could be a linear relationship. Let’s try to spread it out a bit.

# scale x log10
ggplot(train, aes(x = LotArea/1000, y = SalePrice/1000)) +
  geom_point(alpha = .1) +
  scale_x_log10() +
  geom_smooth(method='lm') +
  labs(x = 'Lot Area in Thousands',
       y = 'Sale Price in Thousands') +
  theme_bw()
## `geom_smooth()` using formula 'y ~ x'

# powertransform
pt <- powerTransform(train$LotArea)$lambda

# note: the x values are already power-transformed, so no log10 scale is applied here
ggplot(train, aes(x = LotArea^pt, y = SalePrice/1000)) +
  geom_point(alpha = .1) +
  geom_smooth(method='lm') +
  labs(x = 'Lot Area Transformed',
       y = 'Sale Price in Thousands') +
  theme_bw()
## `geom_smooth()` using formula 'y ~ x'
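For reference (an aside, not part of the original analysis), car also provides bcPower() to apply the standard Box–Cox form of the transformation, (x^lambda - 1) / lambda, rather than a raw power.

# standard Box-Cox transform of LotArea using the estimated lambda
lot_bc <- car::bcPower(train$LotArea, pt)
head(lot_bc)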

 

First Floor Square Feet

Once you step into a house, you’re on the first floor, and that’s where you form your first impression. Let’s see if the size of the first floor has a relationship to the sale price.

ggplot(train, aes(x = X1stFlrSF, y = SalePrice/1000)) +
  geom_point(alpha = .1) +
  geom_smooth(method='lm') +
  labs(x = 'First Floor Square Feet',
       y = 'Sale Price in Thousands') +
  theme_bw()
## `geom_smooth()` using formula 'y ~ x'

No need to look any further, this relationship is pretty obvious.

 

Correlation Matrix

Let’s test the correlations between these three numerical variables:

  • LotFrontage – Linear feet of street connected to property

  • YearBuilt – Original construction date

  • X2ndFlrSF - Second floor square feet

three <- dplyr::select(train, LotFrontage, YearBuilt, X2ndFlrSF)

cmatrix <- cor(filter(three, !is.na(LotFrontage)))
cmatrix
##             LotFrontage   YearBuilt   X2ndFlrSF
## LotFrontage  1.00000000  0.12334947  0.08017727
## YearBuilt    0.12334947  1.00000000 -0.01786209
## X2ndFlrSF    0.08017727 -0.01786209  1.00000000
ggpairs(three)

# Lot Frontage vs Year Built
cor.test(train$LotFrontage, train$YearBuilt, conf.level = 0.80)
## 
##  Pearson's product-moment correlation
## 
## data:  train$LotFrontage and train$YearBuilt
## t = 4.304, df = 1199, p-value = 0.00001814
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.08673626 0.15962991
## sample estimates:
##       cor 
## 0.1233495

It looks like we can be 80% confident that the correlation between the two variables is between roughly 8.7% and 16%. With such a small p-value, we can reject the null hypothesis that the correlation is zero.

# Lot Frontage vs 2nd Floor Square Feet
cor.test(train$LotFrontage, train$X2ndFlrSF, conf.level = 0.80)
## 
##  Pearson's product-moment correlation
## 
## data:  train$LotFrontage and train$X2ndFlrSF
## t = 2.7852, df = 1199, p-value = 0.005433
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.04329658 0.11683974
## sample estimates:
##        cor 
## 0.08017727

I initially thought that these two variables would be collinear in some way, but it looks like there’s very little correlation here. With an 80% confidence interval, the correlation is between 4.3% and 11.7%. The p-value is again low enough to reject the null hypothesis.

# Year Built vs 2nd Floor Square Feet
cor.test(train$YearBuilt, train$X2ndFlrSF, conf.level = 0.80)
## 
##  Pearson's product-moment correlation
## 
## data:  train$YearBuilt and train$X2ndFlrSF
## t = 0.39361, df = 1458, p-value = 0.6939
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  -0.02326203  0.04385413
## sample estimates:
##        cor 
## 0.01030766

The correlation between these two variables, with an 80% confidence interval, is between -2.3% and 4.4%. Since the p-value is so high, we can’t say that this correlation is different from zero.

Considering I tried to choose variables that were unrelated, and that we used an 80% confidence level, I would be concerned about familywise error for the first two tests. Both intervals sat entirely in the single digits, so raising the confidence level could potentially extend the ranges to include zero. However, in both cases the p-value was small enough to be fairly confident. I also tested 99% confidence intervals to see what would happen, and the ranges still didn’t reach far enough to cover zero.
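To put a number on the familywise concern (a small supplementary calculation, not in the original), three tests at the 20% significance level carry roughly a 49% chance of at least one false positive if all the nulls were true; a Bonferroni correction would test each pair at about 6.7% instead.

alpha <- 0.20          # significance level implied by the 80% confidence intervals
m <- 3                 # number of pairwise correlation tests
1 - (1 - alpha)^m      # approximate familywise error rate (~0.49)
alpha / m              # Bonferroni-adjusted per-test alpha (~0.067)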

 

Precision Matrix

5 points. Linear Algebra and Correlation. Invert your correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.

pmatrix <- solve(cmatrix)
pmatrix
##             LotFrontage   YearBuilt   X2ndFlrSF
## LotFrontage   1.0224989 -0.12762977 -0.08426090
## YearBuilt    -0.1276298  1.01625009  0.02838536
## X2ndFlrSF    -0.0842609  0.02838536  1.00726283
cpmx <- round(cmatrix %*% pmatrix, 4)
cpmx
##             LotFrontage YearBuilt X2ndFlrSF
## LotFrontage           1         0         0
## YearBuilt             0         1         0
## X2ndFlrSF             0         0         1
pcmx <- round(pmatrix %*% cmatrix, 4)
pcmx
##             LotFrontage YearBuilt X2ndFlrSF
## LotFrontage           1         0         0
## YearBuilt             0         1         0
## X2ndFlrSF             0         0         1
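As a cross-check (added here for illustration), each diagonal entry of the precision matrix equals 1 / (1 - R^2) from regressing that variable on the other two, which is exactly its variance inflation factor. The helper data frame three_cc below is introduced only for this check.

# VIF for LotFrontage computed directly; it should match diag(pmatrix)[1]
three_cc <- filter(three, !is.na(LotFrontage))
r2 <- summary(lm(LotFrontage ~ YearBuilt + X2ndFlrSF, data = three_cc))$r.squared
1 / (1 - r2)
diag(pmatrix)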

 

LU Decomposition

factorize <- function(A) {
  
  # Assuming that the matrix is square
  m <- nrow(A)
  n <- m
  
  # Identity matrix base
  L <- matrix(0, nrow = m, ncol = n)
  
  # add 1 to diagonal
  for (i in 1:m) {
    L[i,i] <- 1
  }
  
  # start with A
  U <- A
  
  # loop through rows 2 thru m
  for (i in 2:m) {
    
    # loop through columns 1 thru (i - 1)
    for (j in 1:(i-1)) {
        
        # determine factor to zero out the spot
        f <- (U[i,j] / U[j,j]) * -1

        # zero out the spot using identity location and factor
        U[i,] <- (f * U[j, ]) + U[i,]
        
        # inverse factor added to L
        L[i,j] <- f * -1
        
    }

  }
  
  # return list L and U
  return(list(L,U))
  
}

lu <- factorize(cmatrix)

# L
lu[1] 
## [[1]]
##            [,1]        [,2] [,3]
## [1,] 1.00000000  0.00000000    0
## [2,] 0.12334947  1.00000000    0
## [3,] 0.08017727 -0.02818068    1
# U
lu[2]
## [[1]]
##             LotFrontage YearBuilt   X2ndFlrSF
## LotFrontage           1 0.1233495  0.08017727
## YearBuilt             0 0.9847849 -0.02775191
## X2ndFlrSF             0 0.0000000  0.99278954
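As a quick check that the factorization is valid (not shown in the original), multiplying L by U should recover the correlation matrix.

# L %*% U should reproduce cmatrix; any differences are floating point noise
round(lu[[1]] %*% lu[[2]] - cmatrix, 10)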

 

Right Skew

5 points. Calculus-Based Probability & Statistics. Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ). Find the optimal value of \(\lambda\) for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, \(\lambda\))). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.

Since we saw earlier that lot area is skewed to the right, let’s work with that.

# check skew
ggplot(train, aes(x = LotArea/1000)) +
  geom_density() +
  labs(title = 'Lot Area Distribution',
       y = element_blank(),
       x = 'Lot Area Square Feet (Thousands)') +
  theme_classic()

# fit distribution
fd <- fitdistr(train$LotArea, densfun = 'exponential')
fd
##         rate     
##   0.000095085704 
##  (0.000002488507)
lambda <- fd$estimate
lambda
##         rate 
## 0.0000950857
lsample <- rexp(1000, lambda)

ls_df <- data.frame('LotArea' = lsample, 'source' = 'lambda sample') 

edf <- dplyr::select(train, LotArea) %>%
  mutate(source = 'training data') %>%
  bind_rows(ls_df)

ggplot(edf, aes(x = LotArea/1000, fill = source)) +
  geom_density(alpha = .5) +
  labs(title = 'Lot Area Distribution',
       y = element_blank(),
       x = 'Lot Area Square Feet (Thousands)') +
  theme_bw()

The distributions are similar, but the data doesn’t exactly follow an exponential distribution, even though it is skewed to the right.
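As a sanity check (an aside, not in the original), the maximum-likelihood estimate of the exponential rate is simply the reciprocal of the sample mean, so fitdistr’s estimate can be verified by hand.

# MLE of the exponential rate is 1 / sample mean; should match fd$estimate
1 / mean(train$LotArea)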

 

Percentiles

Exponential Distribution Using CDF
# 5th Percentile
qexp(0.05, lambda)
## [1] 539.4428
# 95th Percentile
qexp(0.95, lambda)
## [1] 31505.6
  • About 540 to roughly 31,500 square feet
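For reference (added here), these values come from inverting the exponential CDF \(F(q) = 1 - e^{-\lambda q}\), so the p-th percentile is \(-\ln(1-p)/\lambda\).

# closed-form inverse CDF of the exponential; matches qexp()
-log(1 - 0.05) / lambda
-log(1 - 0.95) / lambda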

 

95% Confidence Interval – Assuming Normal Distribution

(100% - 95%) / 2 = 2.5% in each tail

mean <- mean(train$LotArea)
sd <- sd(train$LotArea)

# lower boundary
qnorm(.025, mean, sd)
## [1] -9046.092
# upper boundary
qnorm(.025, mean, sd, lower.tail = F)
## [1] 30079.75

Since lot area can’t be negative and the data isn’t actually normally distributed (it’s right-skewed), a negative lower bound doesn’t make sense.
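The interval above is a 95% coverage interval for individual lot sizes (mean ± 1.96 sd). If the assignment instead intends a confidence interval for the mean lot area, the standard error shrinks by a factor of sqrt(n); a minimal sketch of that alternative reading, using the mean and sd values already computed:

# 95% confidence interval for the mean of LotArea, assuming normality
n_obs <- nrow(train)
se <- sd / sqrt(n_obs)
c(mean - qnorm(.975) * se, mean + qnorm(.975) * se)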

 

90% Confidence Interval – Assuming Normal Distribution

For comparison’s sake, let’s also check the 90% confidence interval assuming a normal distribution.

# lower boundary
qnorm(.05, mean, sd)
## [1] -5900.892
# upper boundary
qnorm(.05, mean, sd, lower.tail = F)
## [1] 26934.55
  • About -5.9k to 27k

 

5th and 95th Percentile of Data
# 5th Percentile
quantile(train$LotArea, .05)
##     5% 
## 3311.7
# 95th Percentile
quantile(train$LotArea, .95)
##      95% 
## 17401.15
  • About 3.3k to 17.4k

In the data, there’s a sharp rise and a sharp fall, along with a right skew. This is likely related to zoning, which keeps lots within a neighborhood at roughly the same size. Since lots can’t have negative square feet, the distribution can only skew to the right, which it does – driven by very large lots and possibly commercial properties.

 

Multiple Regression

10 points. Modeling. Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.

Preparing the Data

# Remove ID field
vars <- dplyr::select(train, -Id)

# create an empty dataframe to track null variables
null_df <- data.frame('column' = NA, 
                      'dtype' = NA, 
                      'null_count' = NA)

# loop through each variable
for (i in colnames(vars)) {
  
  # count number of null values
  null_count <- sum(is.na(vars[i]))
  
  # keep track of variables with nulls
  if (null_count > 0) {
    
    new_null = data.frame('column' = i, 
                          'dtype' = class(vars[i][1,]),
                          'null_count' = null_count)
    
    null_df <- bind_rows(null_df, new_null)
    
  }

}

null_df <- filter(null_df, !is.na(column))

null_df
##          column     dtype null_count
## 1   LotFrontage   integer        259
## 2         Alley character       1369
## 3    MasVnrType character          8
## 4    MasVnrArea   integer          8
## 5      BsmtQual character         37
## 6      BsmtCond character         37
## 7  BsmtExposure character         38
## 8  BsmtFinType1 character         37
## 9  BsmtFinType2 character         38
## 10   Electrical character          1
## 11  FireplaceQu character        690
## 12   GarageType character         81
## 13  GarageYrBlt   integer         81
## 14 GarageFinish character         81
## 15   GarageQual character         81
## 16   GarageCond character         81
## 17       PoolQC character       1453
## 18        Fence character       1179
## 19  MiscFeature character       1406
  • Mostly empty fields that will be excluded:

    • Alley
    • FireplaceQu
    • PoolQC
    • Fence
    • MiscFeature
  • LotFrontage would likely be collinear with total lot size, or LotArea, so we can exclude that too (a quick check of this assumption is sketched after this list)

  • Numeric fields:

    • MasVnrArea
    • GarageYrBlt
  • Categorical Fields:

    • MasVnrType
    • BsmtQual
    • BsmtCond
    • BsmtExposure
    • BsmtFinType1
    • BsmtFinType2
    • Electrical
    • GarageType
    • GarageFinish
    • GarageQual
    • GarageCond
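A quick check of the collinearity assumption mentioned above (added for illustration, not part of the original prep): LotFrontage and LotArea are both measures of lot size, and a nontrivial correlation supports dropping LotFrontage.

# correlation between the two lot-size measures, ignoring missing LotFrontage values
cor(train$LotFrontage, train$LotArea, use = 'complete.obs')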
### Remove variables ###
vars <- dplyr::select(vars, -Alley, -FireplaceQu, -PoolQC, -Fence, -MiscFeature, -LotFrontage)


### Numeric ###

# Masonry Veneer Area
ggplot(vars, aes(x = MasVnrArea)) +
  geom_density()
## Warning: Removed 8 rows containing non-finite values (stat_density).

# Since the distribution seems to show mostly zeros, use zeros for the null values
vars$MasVnrArea[is.na(vars$MasVnrArea)] <- 0

# Garage Year Built
ggplot(vars, aes(x = GarageYrBlt)) +
  geom_density()
## Warning: Removed 81 rows containing non-finite values (stat_density).

# Looks like they're mostly newer, but the years are pretty spread apart

# Garage Year Built by Neighborhood
ggplot(vars, aes(x = GarageYrBlt)) +
  geom_density() +
  facet_wrap(~ Neighborhood, scales='free_y')
## Warning: Removed 81 rows containing non-finite values (stat_density).

# Garage median year built by neighborhood
g_year <- group_by(vars, Neighborhood) %>%
  summarize(med_year = median(GarageYrBlt, na.rm = T))

# join in median garage build year by neighborhood
vars <- inner_join(vars, g_year, by = c('Neighborhood' = 'Neighborhood'))  

# fill in if null
vars$GarageYrBlt <- ifelse(is.na(vars$GarageYrBlt), vars$med_year, vars$GarageYrBlt)

# remove median vals
vars$med_year <- NULL


### Categorical Variables ###

cat_vars <- filter(null_df, dtype == 'character', null_count < 200)

for (i in 1:nrow(cat_vars)) {
  
  col_name <- cat_vars$column[i]
  
  print(col_name)
  
  print(table(vars[col_name]))
  
  print('-------------------------------')
}
## [1] "MasVnrType"
## 
##  BrkCmn BrkFace    None   Stone 
##      15     445     864     128 
## [1] "-------------------------------"
## [1] "BsmtQual"
## 
##  Ex  Fa  Gd  TA 
## 121  35 618 649 
## [1] "-------------------------------"
## [1] "BsmtCond"
## 
##   Fa   Gd   Po   TA 
##   45   65    2 1311 
## [1] "-------------------------------"
## [1] "BsmtExposure"
## 
##  Av  Gd  Mn  No 
## 221 134 114 953 
## [1] "-------------------------------"
## [1] "BsmtFinType1"
## 
## ALQ BLQ GLQ LwQ Rec Unf 
## 220 148 418  74 133 430 
## [1] "-------------------------------"
## [1] "BsmtFinType2"
## 
##  ALQ  BLQ  GLQ  LwQ  Rec  Unf 
##   19   33   14   46   54 1256 
## [1] "-------------------------------"
## [1] "Electrical"
## 
## FuseA FuseF FuseP   Mix SBrkr 
##    94    27     3     1  1334 
## [1] "-------------------------------"
## [1] "GarageType"
## 
##  2Types  Attchd Basment BuiltIn CarPort  Detchd 
##       6     870      19      88       9     387 
## [1] "-------------------------------"
## [1] "GarageFinish"
## 
## Fin RFn Unf 
## 352 422 605 
## [1] "-------------------------------"
## [1] "GarageQual"
## 
##   Ex   Fa   Gd   Po   TA 
##    3   48   14    3 1311 
## [1] "-------------------------------"
## [1] "GarageCond"
## 
##   Ex   Fa   Gd   Po   TA 
##    2   35    9    7 1326 
## [1] "-------------------------------"
# Let's try using the most common value for each category

for (i in 1:nrow(cat_vars)) {
  
  col_name <- cat_vars$column[i]
  
  # names() is needed here -- without it, the count of the most common value
  # (not the value itself) would be written into the column
  common_val <- names(sort(table(vars[col_name]), decreasing = TRUE))[1]
  
  # match the column name exactly to avoid partial matches
  col_index <- which(colnames(vars) == col_name)
  
  vars[is.na(vars[col_name]), col_index] <- common_val
}
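A quick post-imputation check (added here, not in the original) to confirm that no missing values remain in the columns that were just filled in.

# should return all zeros after imputation
sapply(vars[cat_vars$column], function(col) sum(is.na(col)))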

Linear Model

lm <- lm(SalePrice ~ LotArea + LandSlope + Neighborhood + Condition2 + OverallQual + 
           OverallCond + YearBuilt + RoofMatl + ExterQual + BsmtQual + BsmtExposure + 
           BsmtFinSF1 + BsmtUnfSF, data = vars)

summary(lm)
## 
## Call:
## lm(formula = SalePrice ~ LotArea + LandSlope + Neighborhood + 
##     Condition2 + OverallQual + OverallCond + YearBuilt + RoofMatl + 
##     ExterQual + BsmtQual + BsmtExposure + BsmtFinSF1 + BsmtUnfSF, 
##     data = vars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -144397  -17986   -1113   15733  285898 
## 
## Coefficients:
##                        Estimate  Std. Error t value             Pr(>|t|)    
## (Intercept)         -805962.318  156246.542  -5.158    0.000000284957710 ***
## LotArea                   1.240       0.123  10.077 < 0.0000000000000002 ***
## LandSlopeMod           8738.560    4729.378   1.848             0.064854 .  
## LandSlopeSev         -50319.474   13404.986  -3.754             0.000181 ***
## NeighborhoodBlueste   18014.292   25668.532   0.702             0.482918    
## NeighborhoodBrDale   -15807.648   12537.351  -1.261             0.207575    
## NeighborhoodBrkSide    6747.809   10750.397   0.628             0.530316    
## NeighborhoodClearCr   20170.070   11459.940   1.760             0.078617 .  
## NeighborhoodCollgCr   14168.202    8810.636   1.608             0.108043    
## NeighborhoodCrawfor   38695.462   10589.474   3.654             0.000268 ***
## NeighborhoodEdwards   -1804.655    9805.376  -0.184             0.854003    
## NeighborhoodGilbert   23731.450    9321.967   2.546             0.011010 *  
## NeighborhoodIDOTRR    -8002.118   11385.900  -0.703             0.482291    
## NeighborhoodMeadowV    1010.956   12299.045   0.082             0.934501    
## NeighborhoodMitchel   -8573.257   10119.705  -0.847             0.397037    
## NeighborhoodNAmes      3196.010    9475.356   0.337             0.735943    
## NeighborhoodNoRidge   91739.139   10052.846   9.126 < 0.0000000000000002 ***
## NeighborhoodNPkVill    3458.390   14414.632   0.240             0.810426    
## NeighborhoodNridgHt   38390.535    9498.085   4.042    0.000055894512854 ***
## NeighborhoodNWAmes    17627.722    9736.884   1.810             0.070447 .  
## NeighborhoodOldTown    2288.379   10454.342   0.219             0.826765    
## NeighborhoodSawyer     3738.988   10036.073   0.373             0.709536    
## NeighborhoodSawyerW   16567.572    9638.559   1.719             0.085856 .  
## NeighborhoodSomerst   23829.036    9127.862   2.611             0.009135 ** 
## NeighborhoodStoneBr   52838.148   10903.479   4.846    0.000001399894364 ***
## NeighborhoodSWISU     16582.517   11983.851   1.384             0.166659    
## NeighborhoodTimber     8664.139   10296.653   0.841             0.400238    
## NeighborhoodVeenker   23904.216   13523.648   1.768             0.077348 .  
## Condition2Feedr        6143.810   28028.531   0.219             0.826527    
## Condition2Norm        10878.603   24458.518   0.445             0.656549    
## Condition2PosA        36080.454   42723.494   0.845             0.398528    
## Condition2PosN      -157478.229   34870.361  -4.516    0.000006827123873 ***
## Condition2RRAe        54728.829   42072.677   1.301             0.193535    
## Condition2RRAn       -18251.610   42098.133  -0.434             0.664683    
## Condition2RRNn        10326.222   34238.009   0.302             0.763001    
## OverallQual           21318.390    1175.109  18.142 < 0.0000000000000002 ***
## OverallCond            5158.390     933.864   5.524    0.000000039511117 ***
## YearBuilt               169.400      74.182   2.284             0.022546 *  
## RoofMatlCompShg      486792.734   38066.770  12.788 < 0.0000000000000002 ***
## RoofMatlMembran      552304.095   53694.331  10.286 < 0.0000000000000002 ***
## RoofMatlMetal        509105.242   53402.117   9.533 < 0.0000000000000002 ***
## RoofMatlRoll         500049.521   51152.717   9.776 < 0.0000000000000002 ***
## RoofMatlTar&Grv      500217.877   39671.688  12.609 < 0.0000000000000002 ***
## RoofMatlWdShake      518320.505   41396.199  12.521 < 0.0000000000000002 ***
## RoofMatlWdShngl      552048.933   39868.098  13.847 < 0.0000000000000002 ***
## ExterQualFa          -43737.551   12147.172  -3.601             0.000329 ***
## ExterQualGd          -40157.307    6187.553  -6.490    0.000000000118766 ***
## ExterQualTA          -51230.103    6890.959  -7.434    0.000000000000182 ***
## BsmtQualEx            11670.879   35005.639   0.333             0.738882    
## BsmtQualFa           -26476.253   35068.193  -0.755             0.450380    
## BsmtQualGd           -31400.117   34668.318  -0.906             0.365235    
## BsmtQualTA           -28064.718   34587.717  -0.811             0.417270    
## BsmtExposureAv         4693.043   34061.947   0.138             0.890434    
## BsmtExposureGd        19308.985   34210.245   0.564             0.572558    
## BsmtExposureMn        11497.188   34162.766   0.337             0.736513    
## BsmtExposureNo         1617.352   34038.625   0.048             0.962109    
## BsmtFinSF1               45.054       3.394  13.276 < 0.0000000000000002 ***
## BsmtUnfSF                28.476       3.175   8.970 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 33870 on 1402 degrees of freedom
## Multiple R-squared:  0.8253, Adjusted R-squared:  0.8182 
## F-statistic: 116.2 on 57 and 1402 DF,  p-value: < 0.00000000000000022
vars$pred <- predict(lm, newdata = vars)

ggplot(vars, aes(x = pred/100000, y = SalePrice/100000)) +
  geom_point(alpha = .3) +
  labs(x = 'Prediction (100k)',
       y = 'Sale Price (100k)') +
  geom_smooth(method = 'lm') +
  theme_bw()
## `geom_smooth()` using formula 'y ~ x'
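For a rough sense of fit beyond R-squared (added here as a supplement; note that Kaggle scores this competition on the error of logged sale prices, so this in-sample figure is only a gauge):

# in-sample root mean squared error of the predictions
sqrt(mean((vars$SalePrice - vars$pred)^2))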

Clean up test data

# create an empty dataframe to track null variables
null_df <- data.frame('column' = NA, 
                      'dtype' = NA, 
                      'null_count' = NA)

# loop through each variable
for (i in colnames(test)) {
  
  # count number of null values
  null_count <- sum(is.na(test[i]))
  
  # keep track of variables with nulls
  if (null_count > 0) {
    
    new_null = data.frame('column' = i, 
                          'dtype' = class(test[i][1,]),
                          'null_count' = null_count)
    
    null_df <- bind_rows(null_df, new_null)
    
  }

}

null_df <- filter(null_df, !is.na(column))

null_df
##          column     dtype null_count
## 1      MSZoning character          4
## 2   LotFrontage   integer        227
## 3         Alley character       1352
## 4     Utilities character          2
## 5   Exterior1st character          1
## 6   Exterior2nd character          1
## 7    MasVnrType character         16
## 8    MasVnrArea   integer         15
## 9      BsmtQual character         44
## 10     BsmtCond character         45
## 11 BsmtExposure character         44
## 12 BsmtFinType1 character         42
## 13   BsmtFinSF1   integer          1
## 14 BsmtFinType2 character         42
## 15   BsmtFinSF2   integer          1
## 16    BsmtUnfSF   integer          1
## 17  TotalBsmtSF   integer          1
## 18 BsmtFullBath   integer          2
## 19 BsmtHalfBath   integer          2
## 20  KitchenQual character          1
## 21   Functional character          2
## 22  FireplaceQu character        730
## 23   GarageType character         76
## 24  GarageYrBlt   integer         78
## 25 GarageFinish character         78
## 26   GarageCars   integer          1
## 27   GarageArea   integer          1
## 28   GarageQual character         78
## 29   GarageCond character         78
## 30       PoolQC character       1456
## 31        Fence character       1169
## 32  MiscFeature character       1408
## 33     SaleType character          1
# Check which predictor variables have nulls and fill them in
# LotArea
# LandSlope
# Neighborhood
# Condition2
# OverallQual
# OverallCond
# YearBuilt
# RoofMatl 
# ExterQual
# BsmtQual -- check
# BsmtExposure -- check
# BsmtFinSF1 -- check
# BsmtUnfSF -- check

# basement quality
table(test$BsmtQual)
## 
##  Ex  Fa  Gd  TA 
## 137  53 591 634
test$BsmtQual[is.na(test$BsmtQual)] <- 'Gd'

# basement exposure
table(test$BsmtExposure) 
## 
##  Av  Gd  Mn  No 
## 197 142 125 951
test$BsmtExposure[is.na(test$BsmtExposure)] <- 'No'

# basement finished square feet
test$BsmtFinSF1[is.na(test$BsmtFinSF1)] <- mean(train$BsmtFinSF1, na.rm = T)

# basement unfinished square feet
test$BsmtUnfSF[is.na(test$BsmtUnfSF)] <- mean(train$BsmtUnfSF, na.rm = T)
Creating Predictions With Test Data
test$SalePrice <- predict(lm, newdata = test)

test_results <- dplyr::select(test, Id, SalePrice)

# getwd()

write.csv(test_results, 'ly_test_results_20201209.csv', row.names = F)

Kaggle Submission

username: dataconsumer101

score: 0.21385