options(scipen = 100,
dplyr.summarise.inform = FALSE)
set.seed(101)
library(dplyr)
library(ggplot2)
library(tidyr)
library(GGally)
library(ggridges)
library(car)
library(MASS)
library(dummies)
Using R, generate a random variable X that has 10,000 random uniform numbers from 1 to N, where N can be any number of your choosing greater than or equal to 6. Then generate a random variable Y that has 10,000 random normal numbers with a mean and standard deviation of \(\mu = \sigma = (N+1)/2\).
n <- 605
# create a vector for all integers between 1 and N
nlist <- seq(1,n,1)
X <- sample(nlist, 10000, replace = T)
# mu = sigma = (n+1)/2
ms <- (n + 1) / 2
Y <- rnorm(10000, mean = ms, sd = ms)
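As a quick sanity check, not part of the original output, the mean of X, the mean of Y, and the standard deviation of Y should all land near \((N+1)/2 = 303\):
# quick check: mean(X), mean(Y), and sd(Y) should all be near (n + 1) / 2 = 303
c(mean(X), mean(Y), sd(Y))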
Probability. Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the median of the X variable, and the small letter “y” is estimated as the 1st quartile of the Y variable. Interpret the meaning of all probabilities.
x <- median(X)
x
## [1] 303
summary(Y)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -840.0 100.2 296.5 304.6 512.5 1657.7
y <- as.numeric(quantile(Y, 0.25))
y
## [1] 100.1909
df <- data.frame(X,Y)
What is the conditional probability of X > x given X > y?
df$a <- ifelse(df$X > x & df$X > y, 1, 0)
df$a_condition <- ifelse(df$X > y, 1, 0)
sum(df$a) / sum(df$a_condition)
## [1] 0.5983203
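Equivalently, since x (the median of X) is larger than y here, the event X > x is contained in X > y, so the conditional probability reduces to a ratio of marginal probabilities; a quick check of the same answer:
# because x > y, {X > x} is a subset of {X > y}, so P(X > x | X > y) = P(X > x) / P(X > y)
mean(df$X > x) / mean(df$X > y)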
What is the probability of X > x and Y > x?
df$b <- ifelse(df$X > x & df$Y > x, 1, 0)
sum(df$b) / nrow(df)
## [1] 0.2456
What is the conditional probability of X < x given X > y?
df$c <- ifelse(df$X < x & df$X > y, 1, 0)
df$c_condition <- ifelse(df$X > y, 1, 0)
sum(df$c) / sum(df$c_condition)
## [1] 0.3992801
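As a consistency check on parts (a) and (c), not in the original output: conditioned on X > y, the events X > x, X < x, and X = x partition the outcomes, so the three conditional probabilities sum to exactly 1. (x_given_y below is just a helper holding the X values where X > y.)
# the three conditional probabilities partition {X > y}, so they sum to 1
x_given_y <- df$X[df$X > y]
mean(x_given_y > x) + mean(x_given_y < x) + mean(x_given_y == x)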
Investigate whether \(P(X>x \text{ and } Y>y) = P(X>x)P(Y>y)\) by building a table and evaluating the marginal and joint probabilities.
df$y_status <- ifelse(df$Y > y, 'Y > y', 'Y <= y')
# X > x
dfx <- filter(df, X > x) %>%
group_by(y_status) %>%
summarize('X > x' = n())
# X < x
dfx2 <- filter(df, X <= x) %>%
group_by(y_status) %>%
summarize('X <= x' = n())
# all X conditions
df_all <- group_by(df, y_status) %>%
summarize('total' = n())
# Combine
df2 <- inner_join(dfx, dfx2, by = c('y_status' = 'y_status')) %>%
inner_join(df_all, by = c('y_status' = 'y_status'))
df2
## # A tibble: 2 x 4
## y_status `X > x` `X <= x` total
## <chr> <int> <int> <int>
## 1 Y <= y 1244 1256 2500
## 2 Y > y 3743 3757 7500
row_totals <- data.frame('y_status'='total',
'X > x' = nrow(filter(df, X > x)),
'X <= x' = nrow(filter(df, X <= x)),
'total' = nrow(df))
colnames(row_totals)[2] <- 'X > x'
colnames(row_totals)[3] <- 'X <= x'
df3 <- bind_rows(df2, row_totals)
df3
## # A tibble: 3 x 4
## y_status `X > x` `X <= x` total
## <chr> <int> <int> <int>
## 1 Y <= y 1244 1256 2500
## 2 Y > y 3743 3757 7500
## 3 total 4987 5013 10000
df4 <- mutate(df3,
`X > x` = `X > x` / nrow(df),
`X <= x` = `X <= x` / nrow(df),
total = total / nrow(df))
df4
## # A tibble: 3 x 4
## y_status `X > x` `X <= x` total
## <chr> <dbl> <dbl> <dbl>
## 1 Y <= y 0.124 0.126 0.25
## 2 Y > y 0.374 0.376 0.75
## 3 total 0.499 0.501 1
\(P(X>x \text{ and } Y>y) = P(X>x)P(Y>y)\)
left <- filter(df4, y_status == 'Y > y')$`X > x`
left
## [1] 0.3743
right <- filter(df4, y_status == 'total')$`X > x` * filter(df4, y_status == 'Y > y')$total
right
## [1] 0.374025
left == right
## [1] FALSE
right - left
## [1] -0.000275
The two values are nearly equal; the small difference (about 0.0003) is consistent with sampling variation, which supports the idea that the two events are independent.
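The same comparison can also be made directly from the raw samples rather than the rounded table, as a quick check:
# joint probability vs. product of marginals, computed from the raw samples
mean(df$X > x & df$Y > y)          # P(X > x and Y > y)
mean(df$X > x) * mean(df$Y > y)    # P(X > x) * P(Y > y)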
Check to see if the independence holds by using Fisher’s Exact Test and the Chi Square Test. What is the difference between the two? Which is most appropriate?
# visual/plot check
dplyr::select(df, X, Y) %>%
gather(var, val) %>%
ggplot(aes(x = val, fill = var)) +
geom_density(alpha = .5) +
geom_vline(xintercept = c(x), linetype=2, color = 'sienna') +
geom_vline(xintercept = c(y), linetype=2, color = 'forestgreen') +
theme_classic() +
labs(x = element_blank(), y = element_blank()) +
theme(legend.title = element_blank())
# get counts
df_test <- dplyr::select(df2, -y_status, -total)
df_test
## # A tibble: 2 x 2
## `X > x` `X <= x`
## <int> <int>
## 1 1244 1256
## 2 3743 3757
# as matrix
dfm <- data.matrix(df_test)
fisher.test(df_test)
##
## Fisher's Exact Test for Count Data
##
## data: df_test
## p-value = 0.9081
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.9071551 1.0894989
## sample estimates:
## odds ratio
## 0.9941506
chisq.test(df_test)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: df_test
## X-squared = 0.0108, df = 1, p-value = 0.9172
If we check the plot, there is considerable overlap to the right of both reference lines, but overlapping marginal distributions alone can't tell us whether X and Y are dependent, so we rely on the formal tests.
Looking at both Fisher's Exact Test and Pearson's Chi-squared test, neither has a low enough p-value to reject the null hypothesis of independence.
Fisher's exact test is an exact significance test, typically used for small contingency tables, that computes the probability of tables as extreme as the one observed directly.
The chi-square test is an approximate significance test based on the distribution of the squared differences between observed and expected cell counts.
Since our contingency table is 2x2, Fisher’s test is appropriate here.
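To make the chi-square mechanics concrete, here is a minimal by-hand sketch of the statistic (without the Yates continuity correction that chisq.test applies by default to 2x2 tables); obs, expected, and x2 are just helper names:
# chi-square statistic by hand: sum of (observed - expected)^2 / expected,
# where expected = (row total * column total) / grand total
obs <- dfm
expected <- outer(rowSums(obs), colSums(obs)) / sum(obs)
x2 <- sum((obs - expected)^2 / expected)
x2
pchisq(x2, df = 1, lower.tail = FALSE)   # should match chisq.test(dfm, correct = FALSE)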
You are to register for Kaggle.com (free) and compete in the House Prices: Advanced Regression Techniques competition. https://www.kaggle.com/c/house-prices-advanced-regression-techniques. I want you to do the following.
train <- read.csv('https://github.com/dataconsumer101/data605/raw/main/train.csv', stringsAsFactors = F)
test <- read.csv('https://github.com/dataconsumer101/data605/raw/main/test.csv', stringsAsFactors = F)
5 points. Descriptive and Inferential Statistics. Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot matrix for at least two of the independent variables and the dependent variable. Derive a correlation matrix for any three quantitative variables in the dataset. Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval. Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?
I think one of the first things people do when buying a house is narrow down the location. Let's take a look at the neighborhoods.
ggplot(train, aes(x = SalePrice/1000, y = reorder(Neighborhood, SalePrice))) +
geom_density_ridges() +
theme_bw() +
labs(x = 'Sale Price (Thousands)',
y = 'Neighborhood')
## Picking joint bandwidth of 17.2
ggplot(train, aes(x = SalePrice/1000, y = reorder(Neighborhood, SalePrice))) +
geom_boxplot() +
theme_bw() +
labs(x = 'Sale Price (Thousands)',
y = 'Neighborhood')
First, looking at neighborhoods with narrower distributions or smaller boxes – the houses in those neighborhoods are likely fairly homogeneous. Neighborhoods like SawyerW and those at the top of the two charts above have wider distributions, which suggests there are all types of properties there. This could mean small and large homes, apartments, or even commercial properties.
Let's take a look at how properties are zoned in each neighborhood.
neighborhood_rank <- train %>%
group_by(Neighborhood) %>%
summarize(avg = mean(SalePrice))
train %>%
group_by(MSZoning, Neighborhood) %>%
summarize(count = n()) %>%
inner_join(neighborhood_rank, by = c('Neighborhood' = 'Neighborhood')) %>%
ggplot(aes(x = reorder(Neighborhood, avg), y = count, fill = MSZoning)) +
geom_col() +
labs(x = 'Neighborhood', y = element_blank()) +
coord_flip()
Starting from the top through the middle, we're mostly seeing blue, or RL – residential low density. This refers to detached houses on their own lots.
In Somerst, there’s some FV – or Floating Village Residential. This sounds like a closed community and seems to make up the majority of the houses sold in that neighborhood.
SawyerW, which was the neighborhood in the middle with the wide spread, has mostly residential low density with a bit of RH, or residential high density – typically apartments.
As we get to the lower end of the chart, the neighborhoods with the lowest prices, we see more purple, or residential medium density, and some red in IDOTRR, or commercial properties.
Another obvious factor that goes into the price of a property is its condition. Let's take a look at the overall condition ratings (OverallCond) in each neighborhood.
inner_join(train, neighborhood_rank, by = c('Neighborhood' = 'Neighborhood')) %>%
ggplot(aes(x = OverallCond)) +
geom_histogram(bins=10) +
facet_wrap(~reorder(Neighborhood, -avg), scales = 'free_y') +
theme_bw() +
labs(x = element_blank(), y = element_blank())
Wow, I'm surprised. The panels are ordered by average sale price, starting in the top left and reading across each row. The four most expensive neighborhoods have mostly average condition ratings, and some of the lowest-priced neighborhoods have a good number of higher-rated properties. Considering the distribution of prices was pretty wide for the most expensive neighborhoods, this suggests these condition ratings aren't a strong indicator of price.
As you arrive at a house, the first thing you notice is how big the house is and how much land it covers. Let's see if there's a relationship between the lot size and the sale price.
ggplot(train, aes(x = LotArea/1000, y = SalePrice/1000)) +
geom_point(alpha = .1) +
geom_smooth(method='lm') +
labs(x = 'Lot Area in Thousands',
y = 'Sale Price in Thousands') +
theme_bw()
## `geom_smooth()` using formula 'y ~ x'
It looks like there could be a linear relationship. Let’s try to spread it out a bit.
# scale x log10
ggplot(train, aes(x = LotArea/1000, y = SalePrice/1000)) +
geom_point(alpha = .1) +
scale_x_log10() +
geom_smooth(method='lm') +
labs(x = 'Lot Area in Thousands',
y = 'Sale Price in Thousands') +
theme_bw()
## `geom_smooth()` using formula 'y ~ x'
# powertransform
pt <- powerTransform(train$LotArea)$lambda
ggplot(train, aes(x = LotArea^pt, y = SalePrice/1000)) +
geom_point(alpha = .1) +
scale_x_log10() +
geom_smooth(method='lm') +
labs(x = 'Lot Area Transformed',
y = 'Sale Price in Thousands') +
theme_bw()
## `geom_smooth()` using formula 'y ~ x'
Once you step into a house, you’re on the first floor and you get the first impression of the house. Let’s see if the size of the first floor has a relationship to the sale price.
ggplot(train, aes(x = X1stFlrSF, y = SalePrice/1000)) +
geom_point(alpha = .1) +
geom_smooth(method='lm') +
labs(x = 'First Floor Square Feet',
y = 'Sale Price in Thousands') +
theme_bw()
## `geom_smooth()` using formula 'y ~ x'
No need to look any further; this relationship is pretty obvious.
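To put a rough number on that visual impression (a quick check, not run in the original):
# correlation between first-floor square footage and sale price
cor(train$X1stFlrSF, train$SalePrice)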
Let’s test the correlations between these three numerical variables:
LotFrontage – Linear feet of street connected to property
YearBuilt – Original construction date
X2ndFlrSF - Second floor square feet
three <- dplyr::select(train, LotFrontage, YearBuilt, X2ndFlrSF)
cmatrix <- cor(filter(three, !is.na(LotFrontage)))
cmatrix
## LotFrontage YearBuilt X2ndFlrSF
## LotFrontage 1.00000000 0.12334947 0.08017727
## YearBuilt 0.12334947 1.00000000 -0.01786209
## X2ndFlrSF 0.08017727 -0.01786209 1.00000000
ggpairs(three)
# Lot Frontage vs Year Built
cor.test(train$LotFrontage, train$YearBuilt, conf.level = 0.80)
##
## Pearson's product-moment correlation
##
## data: train$LotFrontage and train$YearBuilt
## t = 4.304, df = 1199, p-value = 0.00001814
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.08673626 0.15962991
## sample estimates:
## cor
## 0.1233495
It looks like we can be 80% confident that the correlation between the two variables is between roughly 8.7% and 16%. With such a small p-value, we can reject the null hypothesis.
# Lot Frontage vs 2nd Floor Square Feet
cor.test(train$LotFrontage, train$X2ndFlrSF, conf.level = 0.80)
##
## Pearson's product-moment correlation
##
## data: train$LotFrontage and train$X2ndFlrSF
## t = 2.7852, df = 1199, p-value = 0.005433
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.04329658 0.11683974
## sample estimates:
## cor
## 0.08017727
I initially thought that these two variables would be collinear in some way, but it looks like there's very little correlation here. With an 80% confidence interval, the correlation is between 4.3% and 11.7%. The p-value is again low enough to reject the null hypothesis.
# Year Built vs 2nd Floor Square Feet
cor.test(train$YearBuilt, train$X2ndFlrSF, conf.level = 0.80)
##
## Pearson's product-moment correlation
##
## data: train$YearBuilt and train$X2ndFlrSF
## t = 0.39361, df = 1458, p-value = 0.6939
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## -0.02326203 0.04385413
## sample estimates:
## cor
## 0.01030766
The correlation between these two variables, with an 80% confidence interval, is between -2.3% and 4.4%. Since the p-value is so high, we can't say that this correlation is different from zero.
Considering I tried to choose variables that were unrelated, and that we used an 80% confidence level, I would be concerned about familywise error for the first two tests. Both of those intervals sat entirely in the single digits (as percentages), so widening the confidence level could plausibly stretch the ranges to include zero. However, in both cases the p-value was small enough to be convincing; I also tried 99% confidence intervals, and the ranges still did not reach zero.
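For a rough sense of the exposure (a sketch, assuming the three tests were independent), the familywise error rate at \(\alpha = 0.20\) and a Bonferroni adjustment of the three p-values reported above can be computed directly; alpha and p_vals are just helper names:
# chance of at least one false positive across three independent tests at alpha = 0.20
alpha <- 0.20
1 - (1 - alpha)^3
# Bonferroni adjustment of the three p-values reported above
p_vals <- c(0.00001814, 0.005433, 0.6939)
p.adjust(p_vals, method = "bonferroni")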
5 points. Linear Algebra and Correlation. Invert your correlation matrix from above. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct LU decomposition on the matrix.
pmatrix <- solve(cmatrix)
pmatrix
## LotFrontage YearBuilt X2ndFlrSF
## LotFrontage 1.0224989 -0.12762977 -0.08426090
## YearBuilt -0.1276298 1.01625009 0.02838536
## X2ndFlrSF -0.0842609 0.02838536 1.00726283
cpmx <- round(cmatrix %*% pmatrix, 4)
cpmx
## LotFrontage YearBuilt X2ndFlrSF
## LotFrontage 1 0 0
## YearBuilt 0 1 0
## X2ndFlrSF 0 0 1
pcmx <- round(pmatrix %*% cmatrix, 4)
pcmx
## LotFrontage YearBuilt X2ndFlrSF
## LotFrontage 1 0 0
## YearBuilt 0 1 0
## X2ndFlrSF 0 0 1
factorize <- function(A) {
# Assuming that the matrix is square
m <- nrow(A)
n <- m
# Identity matrix base
L <- matrix(0, nrow = m, ncol = n)
# add 1 to diagonal
for (i in 1:m) {
L[i,i] <- 1
}
# start with A
U <- A
# loop through rows 2 thru m
for (i in 2:m) {
# loop through columns 1 thru (i - 1)
for (j in 1:(i-1)) {
# determine factor to zero out the spot
f <- (U[i,j] / U[j,j]) * -1
# zero out the spot using identity location and factor
U[i,] <- (f * U[j, ]) + U[i,]
# inverse factor added to L
L[i,j] <- f * -1
}
}
# return list L and U
return(list(L,U))
}
lu <- factorize(cmatrix)
# L
lu[1]
## [[1]]
## [,1] [,2] [,3]
## [1,] 1.00000000 0.00000000 0
## [2,] 0.12334947 1.00000000 0
## [3,] 0.08017727 -0.02818068 1
# U
lu[2]
## [[1]]
## LotFrontage YearBuilt X2ndFlrSF
## LotFrontage 1 0.1233495 0.08017727
## YearBuilt 0 0.9847849 -0.02775191
## X2ndFlrSF 0 0.0000000 0.99278954
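As a quick sanity check, not shown in the original output, multiplying the two factors back together should reproduce the correlation matrix:
# L %*% U should recover cmatrix (ignoring dimname differences)
all.equal(lu[[1]] %*% lu[[2]], cmatrix, check.attributes = FALSE)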
5 points. Calculus-Based Probability & Statistics. Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ). Find the optimal value of \(\lambda\) for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, \(\lambda\))). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.
Since we saw earlier that lot area is skewed to the right, let’s work with that.
# check skew
ggplot(train, aes(x = LotArea/1000)) +
geom_density() +
labs(title = 'Lot Area Distribution',
y = element_blank(),
x = 'Lot Area Square Feet (Thousands)') +
theme_classic()
# fit distribution
fd <- fitdistr(train$LotArea, densfun = 'exponential')
fd
## rate
## 0.000095085704
## (0.000002488507)
lambda <- fd$estimate
lambda
## rate
## 0.0000950857
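For reference, the maximum-likelihood estimate of an exponential rate has a closed form, \(\hat{\lambda} = 1/\bar{x}\), so fitdistr's estimate can be checked by hand:
# exponential MLE: rate = 1 / sample mean; should match fd$estimate above
1 / mean(train$LotArea)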
lsample <- rexp(1000, lambda)
ls_df <- data.frame('LotArea' = lsample, 'source' = 'lambda sample')
edf <- dplyr::select(train, LotArea) %>%
mutate(source = 'training data') %>%
bind_rows(ls_df)
ggplot(edf, aes(x = LotArea/1000, fill = source)) +
geom_density(alpha = .5) +
labs(title = 'Lot Area Distribution',
y = element_blank(),
x = 'Lot Area Square Feet (Thousands)') +
theme_bw()
The distributions are similar, but the data doesn't exactly follow an exponential distribution, even though it does have a right skew.
# 5th Percentile
qexp(0.05, lambda)
## [1] 539.4428
# 95th Percentile
qexp(0.95, lambda)
## [1] 31505.6
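The same percentiles follow from inverting the exponential CDF, \(F(x) = 1 - e^{-\lambda x}\), which gives \(F^{-1}(p) = -\ln(1-p)/\lambda\); a quick check:
# inverting the CDF by hand: F^-1(p) = -log(1 - p) / lambda
-log(1 - 0.05) / lambda   # 5th percentile, ~539
-log(1 - 0.95) / lambda   # 95th percentile, ~31506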
For a 95% interval assuming normality, each tail contains (100% − 95%) / 2 = 2.5% of the distribution.
mean <- mean(train$LotArea)
sd <- sd(train$LotArea)
# lower boundary
qnorm(.025, mean, sd)
## [1] -9046.092
# upper boundary
qnorm(.025, mean, sd, lower.tail = F)
## [1] 30079.75
Since the data isn’t actually normally distributed, it doesn’t make sense that the lower bound is negative.
For comparison's sake, let's also check the 90% interval assuming a normal distribution, since its bounds line up with the 5th and 95th percentiles computed above.
# lower boundary
qnorm(.05, mean, sd)
## [1] -5900.892
# upper boundary
qnorm(.05, mean, sd, lower.tail = F)
## [1] 26934.55
# 5th Percentile
quantile(train$LotArea, .05)
## 5%
## 3311.7
# 95th Percentile
quantile(train$LotArea, .95)
## 95%
## 17401.15
In the data, there's a sharp rise and a sharp fall, along with a right skew. This is likely related to lots within a neighborhood being zoned at roughly the same size. Since a lot can't have negative square feet, the distribution can only skew right, which it does, driven by very large properties and possibly commercial ones.
10 points. Modeling. Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.
# Remove ID field
vars <- dplyr::select(train, -Id)
# create an empty dataframe to track null variables
null_df <- data.frame('column' = NA,
'dtype' = NA,
'null_count' = NA)
# loop through each variable
for (i in colnames(vars)) {
# count number of null values
null_count <- sum(is.na(vars[i]))
# keep track of variables with nulls
if (null_count > 0) {
new_null = data.frame('column' = i,
'dtype' = class(vars[i][1,]),
'null_count' = null_count)
null_df <- bind_rows(null_df, new_null)
}
}
null_df <- filter(null_df, !is.na(column))
null_df
## column dtype null_count
## 1 LotFrontage integer 259
## 2 Alley character 1369
## 3 MasVnrType character 8
## 4 MasVnrArea integer 8
## 5 BsmtQual character 37
## 6 BsmtCond character 37
## 7 BsmtExposure character 38
## 8 BsmtFinType1 character 37
## 9 BsmtFinType2 character 38
## 10 Electrical character 1
## 11 FireplaceQu character 690
## 12 GarageType character 81
## 13 GarageYrBlt integer 81
## 14 GarageFinish character 81
## 15 GarageQual character 81
## 16 GarageCond character 81
## 17 PoolQC character 1453
## 18 Fence character 1179
## 19 MiscFeature character 1406
Mostly empty fields that will be excluded: Alley, FireplaceQu, PoolQC, Fence, and MiscFeature.
LotFrontage would likely be collinear with the total lot size (LotArea), so we can exclude that too.
Numeric fields with nulls to fill in: MasVnrArea and GarageYrBlt.
Categorical fields with nulls to fill in: MasVnrType, BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2, Electrical, GarageType, GarageFinish, GarageQual, and GarageCond.
### Remove variables ###
vars <- dplyr::select(vars, -Alley, -FireplaceQu, -PoolQC, -Fence, -MiscFeature, -LotFrontage)
### Numeric ###
# Masonry Veneer Area
ggplot(vars, aes(x = MasVnrArea)) +
geom_density()
## Warning: Removed 8 rows containing non-finite values (stat_density).
# Since the distribution seems to show mostly zeros, use zeros for the null values
vars$MasVnrArea[is.na(vars$MasVnrArea)] <- 0
# Garage Year Built
ggplot(vars, aes(x = GarageYrBlt)) +
geom_density()
## Warning: Removed 81 rows containing non-finite values (stat_density).
# Looks like they're mostly newer, but the years are pretty spread apart
# Garage Year Built by Neighborhood
ggplot(vars, aes(x = GarageYrBlt)) +
geom_density() +
facet_wrap(~ Neighborhood, scales='free_y')
## Warning: Removed 81 rows containing non-finite values (stat_density).
# Garage median year built by neighborhood
g_year <- group_by(vars, Neighborhood) %>%
summarize(med_year = median(GarageYrBlt, na.rm = T))
# join in median garage build year by neighborhood
vars <- inner_join(vars, g_year, by = c('Neighborhood' = 'Neighborhood'))
# fill in if null
vars$GarageYrBlt <- ifelse(is.na(vars$GarageYrBlt), vars$med_year, vars$GarageYrBlt)
# remove median vals
vars$med_year <- NULL
### Categorical Variables ###
cat_vars <- filter(null_df, dtype == 'character', null_count < 200)
for (i in 1:nrow(cat_vars)) {
col_name <- cat_vars$column[i]
print(col_name)
print(table(vars[col_name]))
print('-------------------------------')
}
## [1] "MasVnrType"
##
## BrkCmn BrkFace None Stone
## 15 445 864 128
## [1] "-------------------------------"
## [1] "BsmtQual"
##
## Ex Fa Gd TA
## 121 35 618 649
## [1] "-------------------------------"
## [1] "BsmtCond"
##
## Fa Gd Po TA
## 45 65 2 1311
## [1] "-------------------------------"
## [1] "BsmtExposure"
##
## Av Gd Mn No
## 221 134 114 953
## [1] "-------------------------------"
## [1] "BsmtFinType1"
##
## ALQ BLQ GLQ LwQ Rec Unf
## 220 148 418 74 133 430
## [1] "-------------------------------"
## [1] "BsmtFinType2"
##
## ALQ BLQ GLQ LwQ Rec Unf
## 19 33 14 46 54 1256
## [1] "-------------------------------"
## [1] "Electrical"
##
## FuseA FuseF FuseP Mix SBrkr
## 94 27 3 1 1334
## [1] "-------------------------------"
## [1] "GarageType"
##
## 2Types Attchd Basment BuiltIn CarPort Detchd
## 6 870 19 88 9 387
## [1] "-------------------------------"
## [1] "GarageFinish"
##
## Fin RFn Unf
## 352 422 605
## [1] "-------------------------------"
## [1] "GarageQual"
##
## Ex Fa Gd Po TA
## 3 48 14 3 1311
## [1] "-------------------------------"
## [1] "GarageCond"
##
## Ex Fa Gd Po TA
## 2 35 9 7 1326
## [1] "-------------------------------"
# Let's try using the most common value for each category
for (i in 1:nrow(cat_vars)) {
col_name <- cat_vars$column[i]
# take the name of the most frequent level, not its count
common_val <- names(sort(table(vars[col_name]), decreasing = TRUE))[1]
col_index <- grep(col_name, colnames(vars))
vars[is.na(vars[col_name]), col_index] <- common_val
}
lm <- lm(SalePrice ~ LotArea + LandSlope + Neighborhood + Condition2 + OverallQual +
OverallCond + YearBuilt + RoofMatl + ExterQual + BsmtQual + BsmtExposure +
BsmtFinSF1 + BsmtUnfSF, data = vars)
summary(lm)
##
## Call:
## lm(formula = SalePrice ~ LotArea + LandSlope + Neighborhood +
## Condition2 + OverallQual + OverallCond + YearBuilt + RoofMatl +
## ExterQual + BsmtQual + BsmtExposure + BsmtFinSF1 + BsmtUnfSF,
## data = vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -144397 -17986 -1113 15733 285898
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -805962.318 156246.542 -5.158 0.000000284957710 ***
## LotArea 1.240 0.123 10.077 < 0.0000000000000002 ***
## LandSlopeMod 8738.560 4729.378 1.848 0.064854 .
## LandSlopeSev -50319.474 13404.986 -3.754 0.000181 ***
## NeighborhoodBlueste 18014.292 25668.532 0.702 0.482918
## NeighborhoodBrDale -15807.648 12537.351 -1.261 0.207575
## NeighborhoodBrkSide 6747.809 10750.397 0.628 0.530316
## NeighborhoodClearCr 20170.070 11459.940 1.760 0.078617 .
## NeighborhoodCollgCr 14168.202 8810.636 1.608 0.108043
## NeighborhoodCrawfor 38695.462 10589.474 3.654 0.000268 ***
## NeighborhoodEdwards -1804.655 9805.376 -0.184 0.854003
## NeighborhoodGilbert 23731.450 9321.967 2.546 0.011010 *
## NeighborhoodIDOTRR -8002.118 11385.900 -0.703 0.482291
## NeighborhoodMeadowV 1010.956 12299.045 0.082 0.934501
## NeighborhoodMitchel -8573.257 10119.705 -0.847 0.397037
## NeighborhoodNAmes 3196.010 9475.356 0.337 0.735943
## NeighborhoodNoRidge 91739.139 10052.846 9.126 < 0.0000000000000002 ***
## NeighborhoodNPkVill 3458.390 14414.632 0.240 0.810426
## NeighborhoodNridgHt 38390.535 9498.085 4.042 0.000055894512854 ***
## NeighborhoodNWAmes 17627.722 9736.884 1.810 0.070447 .
## NeighborhoodOldTown 2288.379 10454.342 0.219 0.826765
## NeighborhoodSawyer 3738.988 10036.073 0.373 0.709536
## NeighborhoodSawyerW 16567.572 9638.559 1.719 0.085856 .
## NeighborhoodSomerst 23829.036 9127.862 2.611 0.009135 **
## NeighborhoodStoneBr 52838.148 10903.479 4.846 0.000001399894364 ***
## NeighborhoodSWISU 16582.517 11983.851 1.384 0.166659
## NeighborhoodTimber 8664.139 10296.653 0.841 0.400238
## NeighborhoodVeenker 23904.216 13523.648 1.768 0.077348 .
## Condition2Feedr 6143.810 28028.531 0.219 0.826527
## Condition2Norm 10878.603 24458.518 0.445 0.656549
## Condition2PosA 36080.454 42723.494 0.845 0.398528
## Condition2PosN -157478.229 34870.361 -4.516 0.000006827123873 ***
## Condition2RRAe 54728.829 42072.677 1.301 0.193535
## Condition2RRAn -18251.610 42098.133 -0.434 0.664683
## Condition2RRNn 10326.222 34238.009 0.302 0.763001
## OverallQual 21318.390 1175.109 18.142 < 0.0000000000000002 ***
## OverallCond 5158.390 933.864 5.524 0.000000039511117 ***
## YearBuilt 169.400 74.182 2.284 0.022546 *
## RoofMatlCompShg 486792.734 38066.770 12.788 < 0.0000000000000002 ***
## RoofMatlMembran 552304.095 53694.331 10.286 < 0.0000000000000002 ***
## RoofMatlMetal 509105.242 53402.117 9.533 < 0.0000000000000002 ***
## RoofMatlRoll 500049.521 51152.717 9.776 < 0.0000000000000002 ***
## RoofMatlTar&Grv 500217.877 39671.688 12.609 < 0.0000000000000002 ***
## RoofMatlWdShake 518320.505 41396.199 12.521 < 0.0000000000000002 ***
## RoofMatlWdShngl 552048.933 39868.098 13.847 < 0.0000000000000002 ***
## ExterQualFa -43737.551 12147.172 -3.601 0.000329 ***
## ExterQualGd -40157.307 6187.553 -6.490 0.000000000118766 ***
## ExterQualTA -51230.103 6890.959 -7.434 0.000000000000182 ***
## BsmtQualEx 11670.879 35005.639 0.333 0.738882
## BsmtQualFa -26476.253 35068.193 -0.755 0.450380
## BsmtQualGd -31400.117 34668.318 -0.906 0.365235
## BsmtQualTA -28064.718 34587.717 -0.811 0.417270
## BsmtExposureAv 4693.043 34061.947 0.138 0.890434
## BsmtExposureGd 19308.985 34210.245 0.564 0.572558
## BsmtExposureMn 11497.188 34162.766 0.337 0.736513
## BsmtExposureNo 1617.352 34038.625 0.048 0.962109
## BsmtFinSF1 45.054 3.394 13.276 < 0.0000000000000002 ***
## BsmtUnfSF 28.476 3.175 8.970 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 33870 on 1402 degrees of freedom
## Multiple R-squared: 0.8253, Adjusted R-squared: 0.8182
## F-statistic: 116.2 on 57 and 1402 DF, p-value: < 0.00000000000000022
vars$pred <- predict(lm, newdata = vars)
ggplot(vars, aes(x = pred/100000, y = SalePrice/100000)) +
geom_point(alpha = .3) +
labs(x = 'Prediction (100k)',
y = 'Sale Price (100k)') +
geom_smooth(method = 'lm') +
theme_bw()
## `geom_smooth()` using formula 'y ~ x'
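As a rough in-sample check before submitting (a sketch, assuming the Kaggle metric of RMSE on the log of the sale price; it is optimistic because it is evaluated on the training data itself):
# in-sample RMSE on the log scale; pmax() guards against any non-positive predictions
sqrt(mean((log(pmax(vars$pred, 1)) - log(vars$SalePrice))^2))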
# create an empty dataframe to track null variables
null_df <- data.frame('column' = NA,
'dtype' = NA,
'null_count' = NA)
# loop through each variable
for (i in colnames(test)) {
# count number of null values
null_count <- sum(is.na(test[i]))
# keep track of variables with nulls
if (null_count > 0) {
new_null = data.frame('column' = i,
'dtype' = class(test[i][1,]),
'null_count' = null_count)
null_df <- bind_rows(null_df, new_null)
}
}
null_df <- filter(null_df, !is.na(column))
null_df
## column dtype null_count
## 1 MSZoning character 4
## 2 LotFrontage integer 227
## 3 Alley character 1352
## 4 Utilities character 2
## 5 Exterior1st character 1
## 6 Exterior2nd character 1
## 7 MasVnrType character 16
## 8 MasVnrArea integer 15
## 9 BsmtQual character 44
## 10 BsmtCond character 45
## 11 BsmtExposure character 44
## 12 BsmtFinType1 character 42
## 13 BsmtFinSF1 integer 1
## 14 BsmtFinType2 character 42
## 15 BsmtFinSF2 integer 1
## 16 BsmtUnfSF integer 1
## 17 TotalBsmtSF integer 1
## 18 BsmtFullBath integer 2
## 19 BsmtHalfBath integer 2
## 20 KitchenQual character 1
## 21 Functional character 2
## 22 FireplaceQu character 730
## 23 GarageType character 76
## 24 GarageYrBlt integer 78
## 25 GarageFinish character 78
## 26 GarageCars integer 1
## 27 GarageArea integer 1
## 28 GarageQual character 78
## 29 GarageCond character 78
## 30 PoolQC character 1456
## 31 Fence character 1169
## 32 MiscFeature character 1408
## 33 SaleType character 1
# Check which predictor variables have nulls and fill them in
# LotArea
# LandSlope
# Neighborhood
# Condition2
# OverallQual
# OverallCond
# YearBuilt
# RoofMatl
# ExterQual
# BsmtQual -- check
# BsmtExposure -- check
# BsmtFinSF1 -- check
# BsmtUnfSF -- check
# basement quality
table(test$BsmtQual)
##
## Ex Fa Gd TA
## 137 53 591 634
test$BsmtQual[is.na(test$BsmtQual)] <- 'Gd'
# basement exposure
table(test$BsmtExposure)
##
## Av Gd Mn No
## 197 142 125 951
test$BsmtExposure[is.na(test$BsmtExposure)] <- 'No'
# basement finished square feet
test$BsmtFinSF1[is.na(test$BsmtFinSF1)] <- mean(train$BsmtFinSF1, na.rm = T)
# basement unfinished square feet
test$BsmtUnfSF[is.na(test$BsmtUnfSF)] <- mean(train$BsmtUnfSF, na.rm = T)
test$SalePrice <- predict(lm, newdata = test)
test_results <- dplyr::select(test, Id, SalePrice)
# getwd()
write.csv(test_results, 'ly_test_results_20201209.csv', row.names = F)
username: dataconsumer101
score: 0.21385