library(tidyverse)
library(ggcorrplot)
library(matrixcalc)
library(MASS)
library(stats)
# set seed so I get same numbers each time
set.seed(365)
# code known values, choose 8 as my N
N <- 8
n <- 10000
sigma <- (N + 1)/2
mu <- sigma
# create dataframe
df <- data.frame(X = runif(n, min = 1, max = N), Y = rnorm(n, mean = mu, sd = sigma))
# preview data
head(df, 10)
hist(df$X, col = 'darkseagreen4', breaks = 8)
hist(df$Y, col = 'darkseagreen4', breaks = 15)
# x is the median of X; y is the first quartile of Y
x = quantile(df$X, 0.5)
y = quantile(df$Y, 0.25)
The probability that X is greater than its median given that X is greater than Q1 of Y is 53.8% in my case.
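Formally, this is the conditional probability \(P(X>x \mid X>y) = \frac{P(X>x,\ X>y)}{P(X>y)}\), which the filtering below estimates from the simulated counts.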
prob_Xx_Xy = df %>%
filter(X > x, X > y) %>%
nrow()/n
prob_Xy = df %>%
filter(X > y) %>%
nrow()/n
prob1 = prob_Xx_Xy/prob_Xy
round(prob1, 3)
## [1] 0.538
The probability that X is greater than x (the median of X) and Y is greater than y (the first quartile of Y) is 37.6% in my case.
prob_Xx_Yy <- df %>%
filter(X > x, Y > y) %>%
nrow()/n
round(prob_Xx_Yy, 3)
## [1] 0.376
The probability that X is less than its median, given that X is greater than the first quartile of Y, is 46.2% in my case.
prob_xX_Xy = df %>%
filter(X < x, X > y) %>%
nrow()/n
prob_Xy = df %>%
filter(X > y) %>%
nrow()/n
prob3 = prob_xX_Xy/prob_Xy
round(prob3, 3)
## [1] 0.462
From the four tables below it appears that \(P(X>x \text{ and } Y>y) = 0.3762\) and \(P(X>x)P(Y>y) = (0.5000)(0.7500) = 0.375\). With a difference of only 0.0012, it seems safe to say that \(P(X>x \text{ and } Y>y) \approx P(X>x)P(Y>y)\), which is what we would expect if X and Y are independent.
# joint probability
jointprob_AB = df %>%
mutate(A = ifelse(X > x, "X > x", "X < x"), B = ifelse(Y > y, " Y > y", " Y < y")) %>%
group_by(A, B) %>% summarise(count = n()) %>%
mutate(probability = count/n)
jointprob_AB
# marginal probabilities
margprob_A = jointprob_AB %>%
ungroup() %>%
group_by(A) %>%
summarize(count = sum(count), probability = sum(probability))
margprob_B = jointprob_AB %>%
ungroup() %>%
group_by(B) %>%
summarize(count = sum(count), probability = sum(probability))
margprob_A
margprob_B
mytab <- bind_rows(jointprob_AB, margprob_A, margprob_B) %>%
dplyr::select(-count) %>%
spread(A, probability)
mytab$B[is.na(mytab$B)] <- "Sum"
mytab <- mytab %>%
rename(Event = 'B')
names(mytab)[4] <- "Sum"
mytab[,4][is.na(mytab[,4])] <- 1
mytab
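Before running the formal tests, the product rule can also be checked directly from the data. This is a minimal sketch reusing the x and y cutoffs defined above; the two values should land very close to each other, matching the tables.
# direct empirical check: joint probability vs. product of the marginals
c(joint = mean(df$X > x & df$Y > y),
product = mean(df$X > x) * mean(df$Y > y))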
The difference between the two tests below (Fisher's Exact Test and the Chi-Square Test) is that Fisher's is an 'exact test', which makes it more accurate for small datasets, whereas the Chi-Square Test should be avoided with small datasets because it loses accuracy when an expected cell count falls below 5. In our case the two perform similarly, since we have 10,000 observations.
Both tests return a p-value of 0.5953, which means we cannot reject the null hypothesis that X and Y are independent.
# put df into table format
dftable <- table(df$X > x, df$Y>y)
# Chi Square Test
chisq.test(dftable)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: dftable
## X-squared = 0.28213, df = 1, p-value = 0.5953
# Fisher Test
fisher.test(dftable)
##
## Fisher's Exact Test for Count Data
##
## data: dftable
## p-value = 0.5953
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.9361461 1.1243231
## sample estimates:
## odds ratio
## 1.025936
# loading in train and test datasets from my github
trainset <- read.csv("Supplements/train.csv", header=T, stringsAsFactors = F)
testset <- read.csv("Supplements/test.csv", header=T, stringsAsFactors = F)
# taking a first look
head(trainset, 15)
summary(trainset)
## Id MSSubClass MSZoning LotFrontage
## Min. : 1.0 Min. : 20.0 Length:1460 Min. : 21.00
## 1st Qu.: 365.8 1st Qu.: 20.0 Class :character 1st Qu.: 59.00
## Median : 730.5 Median : 50.0 Mode :character Median : 69.00
## Mean : 730.5 Mean : 56.9 Mean : 70.05
## 3rd Qu.:1095.2 3rd Qu.: 70.0 3rd Qu.: 80.00
## Max. :1460.0 Max. :190.0 Max. :313.00
## NA's :259
## LotArea Street Alley LotShape
## Min. : 1300 Length:1460 Length:1460 Length:1460
## 1st Qu.: 7554 Class :character Class :character Class :character
## Median : 9478 Mode :character Mode :character Mode :character
## Mean : 10517
## 3rd Qu.: 11602
## Max. :215245
##
## LandContour Utilities LotConfig LandSlope
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Neighborhood Condition1 Condition2 BldgType
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## HouseStyle OverallQual OverallCond YearBuilt
## Length:1460 Min. : 1.000 Min. :1.000 Min. :1872
## Class :character 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954
## Mode :character Median : 6.000 Median :5.000 Median :1973
## Mean : 6.099 Mean :5.575 Mean :1971
## 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000
## Max. :10.000 Max. :9.000 Max. :2010
##
## YearRemodAdd RoofStyle RoofMatl Exterior1st
## Min. :1950 Length:1460 Length:1460 Length:1460
## 1st Qu.:1967 Class :character Class :character Class :character
## Median :1994 Mode :character Mode :character Mode :character
## Mean :1985
## 3rd Qu.:2004
## Max. :2010
##
## Exterior2nd MasVnrType MasVnrArea ExterQual
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 0.0 Mode :character
## Mean : 103.7
## 3rd Qu.: 166.0
## Max. :1600.0
## NA's :8
## ExterCond Foundation BsmtQual BsmtCond
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 383.5 Mode :character
## Mean : 443.6
## 3rd Qu.: 712.2
## Max. :5644.0
##
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Length:1460
## 1st Qu.: 0.00 1st Qu.: 223.0 1st Qu.: 795.8 Class :character
## Median : 0.00 Median : 477.5 Median : 991.5 Mode :character
## Mean : 46.55 Mean : 567.2 Mean :1057.4
## 3rd Qu.: 0.00 3rd Qu.: 808.0 3rd Qu.:1298.2
## Max. :1474.00 Max. :2336.0 Max. :6110.0
##
## HeatingQC CentralAir Electrical X1stFlrSF
## Length:1460 Length:1460 Length:1460 Min. : 334
## Class :character Class :character Class :character 1st Qu.: 882
## Mode :character Mode :character Mode :character Median :1087
## Mean :1163
## 3rd Qu.:1391
## Max. :4692
##
## X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath
## Min. : 0 Min. : 0.000 Min. : 334 Min. :0.0000
## 1st Qu.: 0 1st Qu.: 0.000 1st Qu.:1130 1st Qu.:0.0000
## Median : 0 Median : 0.000 Median :1464 Median :0.0000
## Mean : 347 Mean : 5.845 Mean :1515 Mean :0.4253
## 3rd Qu.: 728 3rd Qu.: 0.000 3rd Qu.:1777 3rd Qu.:1.0000
## Max. :2065 Max. :572.000 Max. :5642 Max. :3.0000
##
## BsmtHalfBath FullBath HalfBath BedroomAbvGr
## Min. :0.00000 Min. :0.000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000
## Median :0.00000 Median :2.000 Median :0.0000 Median :3.000
## Mean :0.05753 Mean :1.565 Mean :0.3829 Mean :2.866
## 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :2.00000 Max. :3.000 Max. :2.0000 Max. :8.000
##
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## Min. :0.000 Length:1460 Min. : 2.000 Length:1460
## 1st Qu.:1.000 Class :character 1st Qu.: 5.000 Class :character
## Median :1.000 Mode :character Median : 6.000 Mode :character
## Mean :1.047 Mean : 6.518
## 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :3.000 Max. :14.000
##
## Fireplaces FireplaceQu GarageType GarageYrBlt
## Min. :0.000 Length:1460 Length:1460 Min. :1900
## 1st Qu.:0.000 Class :character Class :character 1st Qu.:1961
## Median :1.000 Mode :character Mode :character Median :1980
## Mean :0.613 Mean :1979
## 3rd Qu.:1.000 3rd Qu.:2002
## Max. :3.000 Max. :2010
## NA's :81
## GarageFinish GarageCars GarageArea GarageQual
## Length:1460 Min. :0.000 Min. : 0.0 Length:1460
## Class :character 1st Qu.:1.000 1st Qu.: 334.5 Class :character
## Mode :character Median :2.000 Median : 480.0 Mode :character
## Mean :1.767 Mean : 473.0
## 3rd Qu.:2.000 3rd Qu.: 576.0
## Max. :4.000 Max. :1418.0
##
## GarageCond PavedDrive WoodDeckSF OpenPorchSF
## Length:1460 Length:1460 Min. : 0.00 Min. : 0.00
## Class :character Class :character 1st Qu.: 0.00 1st Qu.: 0.00
## Mode :character Mode :character Median : 0.00 Median : 25.00
## Mean : 94.24 Mean : 46.66
## 3rd Qu.:168.00 3rd Qu.: 68.00
## Max. :857.00 Max. :547.00
##
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 0.00 Median : 0.00 Median : 0.00 Median : 0.000
## Mean : 21.95 Mean : 3.41 Mean : 15.06 Mean : 2.759
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :552.00 Max. :508.00 Max. :480.00 Max. :738.000
##
## PoolQC Fence MiscFeature MiscVal
## Length:1460 Length:1460 Length:1460 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 0.00
## Mode :character Mode :character Mode :character Median : 0.00
## Mean : 43.49
## 3rd Qu.: 0.00
## Max. :15500.00
##
## MoSold YrSold SaleType SaleCondition
## Min. : 1.000 Min. :2006 Length:1460 Length:1460
## 1st Qu.: 5.000 1st Qu.:2007 Class :character Class :character
## Median : 6.000 Median :2008 Mode :character Mode :character
## Mean : 6.322 Mean :2008
## 3rd Qu.: 8.000 3rd Qu.:2009
## Max. :12.000 Max. :2010
##
## SalePrice
## Min. : 34900
## 1st Qu.:129975
## Median :163000
## Mean :180921
## 3rd Qu.:214000
## Max. :755000
##
I chose to look at the independent variables Lot Area, Overall Quality, Year Built, Above Ground Square Feet, and Enclosed Porch Square Feet to see their relationships with Sale Price. I first look at a scatterplot matrix, which shows positive correlations for many of the variables. I then plot a correlation matrix and see strong positive correlations for Overall Quality, Above Ground Square Feet, and Year Built. The only variable negatively correlated with Sale Price is Enclosed Porch, and that correlation is weak.
# dataframe with only the target variable and a handful of quantitative variables to look at
corr_df <- trainset %>%
dplyr::select("SalePrice", "LotArea", "OverallQual", "YearBuilt", "GrLivArea", "EnclosedPorch") %>%
replace(is.na(.),0)
# doing a scatterplot matrix
pairs(corr_df)
# plotting correlations
corr_data <- cor(corr_df)
round(corr_data, 3)
## SalePrice LotArea OverallQual YearBuilt GrLivArea EnclosedPorch
## SalePrice 1.000 0.264 0.791 0.523 0.709 -0.129
## LotArea 0.264 1.000 0.106 0.014 0.263 -0.018
## OverallQual 0.791 0.106 1.000 0.572 0.593 -0.114
## YearBuilt 0.523 0.014 0.572 1.000 0.199 -0.387
## GrLivArea 0.709 0.263 0.593 0.199 1.000 0.009
## EnclosedPorch -0.129 -0.018 -0.114 -0.387 0.009 1.000
ggcorrplot(corr_data, method = "circle", type = "upper",
ggtheme = ggplot2::theme_gray,
colors = c("#6D9EC1", "white", "#E46726")) +
ggtitle("Correlation SalePrice and Other Numeric Factors")
I performed correlation tests for four of the five independent variables above, excluding the one with only a slight negative correlation (Enclosed Porch Square Feet). In each case the p-value is nearly zero, so we can reject the hypothesis that the correlation between that variable and SalePrice is 0. There are significant correlations in all of these cases, of varying strengths. Overall Quality has the strongest positive correlation, while Lot Area has the weakest.
Familywise Error is the probability of reaching at least one false conclusion in a series of hypothesis tests, i.e., the probability of making at least one Type I Error. This is mostly a concern when someone runs a very large number of hypothesis tests, such as testing all 100 variables in a dataset, which greatly increases the chance of finding a significant correlation just by chance. In my case, I chose a handful of variables based on my own knowledge of house sale prices, and the correlations here make sense. If we raise the confidence level above 80%, the correlations still hold.
# running the correlations
cor.test(trainset$YearBuilt, trainset$SalePrice, conf.level = 0.8)
##
## Pearson's product-moment correlation
##
## data: trainset$YearBuilt and trainset$SalePrice
## t = 23.424, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.4980766 0.5468619
## sample estimates:
## cor
## 0.5228973
cor.test(trainset$OverallQual, trainset$SalePrice, conf.level = 0.8)
##
## Pearson's product-moment correlation
##
## data: trainset$OverallQual and trainset$SalePrice
## t = 49.364, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.7780752 0.8032204
## sample estimates:
## cor
## 0.7909816
cor.test(trainset$GrLivArea, trainset$SalePrice, conf.level = 0.8)
##
## Pearson's product-moment correlation
##
## data: trainset$GrLivArea and trainset$SalePrice
## t = 38.348, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.6915087 0.7249450
## sample estimates:
## cor
## 0.7086245
cor.test(trainset$LotArea, trainset$SalePrice, conf.level = 0.8)
##
## Pearson's product-moment correlation
##
## data: trainset$LotArea and trainset$SalePrice
## t = 10.445, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.2323391 0.2947946
## sample estimates:
## cor
## 0.2638434
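Given the familywise error concern discussed above, a Bonferroni adjustment of these four p-values is a quick safeguard. This is a small sketch using p.adjust; with p-values this close to zero, the conclusions do not change.
# collect the four p-values and apply a Bonferroni correction
pvals <- sapply(c("YearBuilt", "OverallQual", "GrLivArea", "LotArea"),
function(v) cor.test(trainset[[v]], trainset$SalePrice)$p.value)
p.adjust(pvals, method = "bonferroni")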
# graphing the strongest correlation
trainset$OverallQual.f <- factor(trainset$OverallQual)
ggplot(trainset, aes(x = OverallQual.f, y = SalePrice, fill = OverallQual.f)) +
geom_boxplot() +
ggtitle("Sale Price by Overall Quality Score") +
xlab("OverallQual") +
theme(legend.position = "none")
# create precision matrix
prec_mx <- solve(corr_data)
prec_mx
## SalePrice LotArea OverallQual YearBuilt GrLivArea
## SalePrice 3.99506463 -0.45896382 -1.7901090 -0.7500217 -1.49965357
## LotArea -0.45896382 1.13301045 0.2797899 0.1104850 -0.16113084
## OverallQual -1.79010902 0.27978989 3.1411243 -0.8389518 -0.49914398
## YearBuilt -0.75002173 0.11048503 -0.8389518 1.9707012 0.60255574
## GrLivArea -1.49965357 -0.16113084 -0.4991440 0.6025557 2.28153435
## EnclosedPorch 0.02450799 0.03790057 -0.1874976 0.5677004 -0.04009006
## EnclosedPorch
## SalePrice 0.02450799
## LotArea 0.03790057
## OverallQual -0.18749765
## YearBuilt 0.56770036
## GrLivArea -0.04009006
## EnclosedPorch 1.20270079
# multiply correlation matrix by precision matrix
corr_prec_mx <- corr_data %*% prec_mx
corr_prec_mx
## SalePrice LotArea OverallQual YearBuilt
## SalePrice 1.000000e+00 -2.602085e-17 5.898060e-17 -1.249001e-16
## LotArea 1.852359e-16 1.000000e+00 4.683753e-17 -2.949030e-17
## OverallQual 3.816392e-17 1.040834e-17 1.000000e+00 -8.326673e-17
## YearBuilt 5.499073e-16 -1.908196e-17 4.302114e-16 1.000000e+00
## GrLivArea 1.137870e-16 -6.472687e-17 1.173107e-16 1.387779e-17
## EnclosedPorch 4.163336e-17 -6.938894e-18 2.775558e-17 -2.220446e-16
## GrLivArea EnclosedPorch
## SalePrice -1.734723e-18 0.000000e+00
## LotArea -3.230922e-17 6.938894e-18
## OverallQual 2.359224e-16 -2.775558e-17
## YearBuilt 3.989864e-17 -1.665335e-16
## GrLivArea 1.000000e+00 1.734723e-18
## EnclosedPorch -4.163336e-17 1.000000e+00
# conduct LU Decomposition on that resulting matrix product
lud_mx <- lu.decomposition(corr_prec_mx)
lud_mx
## $L
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 1.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0
## [2,] 1.852359e-16 1.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0
## [3,] 3.816392e-17 1.040834e-17 1.000000e+00 0.000000e+00 0.000000e+00 0
## [4,] 5.499073e-16 -1.908196e-17 4.302114e-16 1.000000e+00 0.000000e+00 0
## [5,] 1.137870e-16 -6.472687e-17 1.173107e-16 1.387779e-17 1.000000e+00 0
## [6,] 4.163336e-17 -6.938894e-18 2.775558e-17 -2.220446e-16 -4.163336e-17 1
##
## $U
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 1 -2.602085e-17 5.898060e-17 -1.249001e-16 -1.734723e-18 0.000000e+00
## [2,] 0 1.000000e+00 4.683753e-17 -2.949030e-17 -3.230922e-17 6.938894e-18
## [3,] 0 0.000000e+00 1.000000e+00 -8.326673e-17 2.359224e-16 -2.775558e-17
## [4,] 0 0.000000e+00 0.000000e+00 1.000000e+00 3.989864e-17 -1.665335e-16
## [5,] 0 0.000000e+00 0.000000e+00 0.000000e+00 1.000000e+00 1.734723e-18
## [6,] 0 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 1.000000e+00
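As a sanity check on the decomposition, the two factors should multiply back to the (numerically identity) matrix product. A one-line sketch:
# largest absolute difference between L %*% U and the original matrix product
max(abs(lud_mx$L %*% lud_mx$U - corr_prec_mx))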
I chose to look at the Open Porch Square Feet variable as it is right-skewed. The minimum value is 0 (no open porch) so we do not need to shift anything.
hist(trainset$OpenPorchSF,breaks = 50)
I fit the exponential probability density function and find the optimal value of lambda below.
# exponential probability density function
porch_exp <- fitdistr(trainset$OpenPorchSF, densfun = "exponential")
porch_exp
## rate
## 0.0214315073
## (0.0005608882)
# the fitted rate is the optimal lambda; its reciprocal 1/lambda is the mean of the fitted distribution
porch_lambda <- porch_exp$estimate
optimal_val <- 1/porch_lambda
optimal_val
## rate
## 46.66027
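For an exponential distribution the maximum-likelihood estimate of the rate is simply the reciprocal of the sample mean, so 1/lambda above should match the sample mean of the variable. A quick check:
# 1/lambda should equal the mean of OpenPorchSF (46.66 per the summary above)
mean(trainset$OpenPorchSF)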
# taking 1000 samples from the distribution using optimal lambda
exp_prob_samps <- rexp(1000, porch_lambda)
The histogram below shows that the exponential model shifts the mass of the data slightly to the right, with fewer values at 0. There are fewer extreme values in the modeled data, both at the high and low ends of the square footage.
# graphing difference between real data and exponential modeled data
openporch <- trainset$OpenPorchSF
tibble(openporch) %>%
ggplot(aes(x=openporch)) +
geom_histogram(aes(y = ..density..,fill = "Observed"),bins = 35, alpha = 0.4) +
geom_histogram(data = tibble(exp_prob_samps),
aes(x = exp_prob_samps, y =..density..,fill="Exponential"),
bins = 50, alpha = 0.4) +
labs(title = "Distribution of Open Porch Square Feet",
subtitle = "Observed Data vs. Exponential Model",
x="Square Feet",
y="Proportion")
The 5th to 95th percentiles of the exponential model run from 2.4 to 139.8, a narrower spread than in the observed data, which runs from 0 to 175.1. The observed data has more extreme observations than the exponential distribution.
# exponential
round(qexp(c(0.05,0.95), porch_lambda), 1)
## [1] 2.4 139.8
# observed data
quantile(openporch,probs=c(0.05,0.95))
## 5% 95%
## 0.00 175.05
Calculating a confidence interval for the sample mean, assuming normality, yields a range of 43.81 to 49.51.
# confidence interval around the sample mean (z = qnorm(0.95))
z <- qnorm(0.95)
m <- z * sd(openporch)/sqrt(length(openporch))
paste(round(mean(openporch - m),2)," - ",
round(mean(openporch + m),2),sep='')
## [1] "43.81 - 49.51"
As calculated above, the 5th and 95th percentiles of the observed data are 0 and 175.1, respectively. The exponential model in this case is not a great fit. There are so many entries with 0 square feet in the observed data, and a handful of homes with exceptionally large open porches, that the exponential model produces too tight a range to accurately represent the observed data.
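One way to visualize the mismatch is an exponential Q-Q plot of the observed square footage against quantiles of the fitted exponential; a good fit would hug the 45-degree line. A sketch:
# exponential Q-Q plot: fitted exponential quantiles vs. observed data
qqplot(qexp(ppoints(length(openporch)), rate = porch_lambda), openporch,
xlab = "Fitted exponential quantiles", ylab = "Observed OpenPorchSF")
abline(0, 1, col = "seagreen4")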
Leaning on the earlier correlation work, I start with a model using the three variables that had high correlations: Overall Quality, Year Built, and Above Ground Square Feet. The Adjusted R-squared indicates that 73.68% of the variance in sale price can be explained by just these three variables. This is an excellent starting point!
trainset <- mutate_if(trainset, is_character, factor)
lm1 <- lm(SalePrice ~
OverallQual +
YearBuilt +
GrLivArea
, data = trainset)
summary(lm1)
##
## Call:
## lm(formula = SalePrice ~ OverallQual + YearBuilt + GrLivArea,
## data = trainset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -393773 -22639 -2424 18437 290554
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.053e+06 8.376e+04 -12.57 <2e-16 ***
## OverallQual 2.520e+04 1.172e+03 21.50 <2e-16 ***
## YearBuilt 5.001e+02 4.409e+01 11.34 <2e-16 ***
## GrLivArea 6.209e+01 2.581e+00 24.06 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 40750 on 1456 degrees of freedom
## Multiple R-squared: 0.7374, Adjusted R-squared: 0.7368
## F-statistic: 1363 on 3 and 1456 DF, p-value: < 2.2e-16
Next I attempt to add further variables to increase the Adjusted R-squared. Many of the ones I tried made minimal improvement, if any, so I kept only the additions of Kitchen Quality, Lot Area, and Total Basement Square Feet. Once I had these six variables, I reran the model six more times, dropping one variable at a time, to confirm that each one was truly adding to the model (a scripted version of this check follows the model summary below). These six variables make intuitive sense: high overall and kitchen quality and large living space are desirable, and the year built tends to track several of these features. The Adjusted R-squared on this model means we can explain 78.9% of the variance in sale price with these six variables, and the summary shows all variables are statistically significant with tiny p-values.
lm2 <- lm(SalePrice ~
YearBuilt +
OverallQual +
KitchenQual +
LotArea +
GrLivArea +
TotalBsmtSF
, data = trainset)
summary(lm2)
##
## Call:
## lm(formula = SalePrice ~ YearBuilt + OverallQual + KitchenQual +
## LotArea + GrLivArea + TotalBsmtSF, data = trainset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -520852 -17183 -812 14747 260166
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.845e+05 8.159e+04 -9.615 < 2e-16 ***
## YearBuilt 4.073e+02 4.222e+01 9.646 < 2e-16 ***
## OverallQual 1.792e+04 1.178e+03 15.222 < 2e-16 ***
## KitchenQualFa -6.271e+04 7.887e+03 -7.950 3.71e-15 ***
## KitchenQualGd -5.165e+04 4.265e+03 -12.111 < 2e-16 ***
## KitchenQualTA -6.374e+04 4.852e+03 -13.138 < 2e-16 ***
## LotArea 7.445e-01 1.017e-01 7.320 4.10e-13 ***
## GrLivArea 5.035e+01 2.434e+00 20.686 < 2e-16 ***
## TotalBsmtSF 2.237e+01 2.777e+00 8.056 1.63e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36490 on 1451 degrees of freedom
## Multiple R-squared: 0.7902, Adjusted R-squared: 0.789
## F-statistic: 683.1 on 8 and 1451 DF, p-value: < 2.2e-16
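The drop-one check described above can also be scripted rather than rerun by hand. A sketch that refits the model leaving out one predictor at a time and compares the adjusted R-squared values:
# refit lm2 without one predictor at a time and record adjusted R-squared
vars <- c("YearBuilt", "OverallQual", "KitchenQual", "LotArea", "GrLivArea", "TotalBsmtSF")
sapply(vars, function(v) {
f <- reformulate(setdiff(vars, v), response = "SalePrice")
round(summary(lm(f, data = trainset))$adj.r.squared, 4)
})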
Checking the residuals plot below, we see a good amount of scatter around 0 with a bit more flaring on the right-hand side.
plot(fitted(lm2), resid(lm2))
Taking a further look at the Q-Q plot, it confirms that extremely high values are not fit as well by the model, but the mid-section of the plot follows the line closely.
qqnorm(resid(lm2))
qqline(resid(lm2), col = "seagreen4")
I test the model built on the training dataset on the test dataset.
test_df <- testset %>%
dplyr::select(YearBuilt, OverallQual, KitchenQual, LotArea, GrLivArea, TotalBsmtSF, Id)
pred <- predict(lm2, newdata = test_df, type="response")
pred <- as.data.frame(pred)
pred_df <- data.frame(Id = test_df$Id, SalePrice = pred$pred)
write.csv(pred_df, file = "rgreenlee_price_prediction.csv", row.names = FALSE)
After submitting my predictions to Kaggle I got a score of 0.18714 and a rank of 7892.
Username: rachel-greenlee
Screenshot of Kaggle Entry