#Load necessary libraries and data
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(knitr)
training_data <- read.csv("https://raw.githubusercontent.com/Marley-Myrianthopoulos/grad_school_data/main/data_605_final_train.csv")
#Treat above ground living area as X and sale price (the dependent variable) as Y
X <- training_data[["GrLivArea"]]
Y <- training_data[["SalePrice"]]
#Count the NA values in each vector; both counts should be zero
sum(is.na(X))
## [1] 0
sum(is.na(Y))
## [1] 0
To make sure that the variable I selected for X is skewed to the right, I’ll verify that the mean is greater than the median.
#Get the mean and median of X
#summary() reports the minimum, quartiles, median, mean, and maximum in a single call
summary(X)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 334 1130 1464 1515 1777 5642
The mean is greater than the median, as desired, suggesting that the variable is skewed to the right. I’ll also inspect the distribution of X visually to make sure.
#Plot a kernel density estimate of GrLivArea to inspect the skew visually
ggplot(training_data, aes(x = GrLivArea)) +
  geom_density()
We can see from the density plot that the variable GrLivArea is skewed right.
#Defining x as the 3rd quartile of X and y as the 2nd quartile (median) of Y
#The argument is spelled out as `probs`; `prob` only worked through partial argument matching
x <- quantile(X, probs = 0.75)
y <- median(Y)
#Creating a data frame of just the relevant columns
probs_df <- data.frame(X, Y)
#Calculating probabilities
#prob_a = P(X > x | Y > y), prob_b = P(X > x, Y > y), prob_c = P(X < x | Y > y)
prob_a <- sum(probs_df$X > x & probs_df$Y > y) / sum(probs_df$Y > y)
prob_b <- sum(probs_df$X > x & probs_df$Y > y) / nrow(probs_df)
prob_c <- sum(probs_df$X < x & probs_df$Y > y) / sum(probs_df$Y > y)
probs <- c(prob_a, prob_b, prob_c)
print(probs)
## [1] 0.4326923 0.2157534 0.5673077
\(P(X>x|Y>y)\approx0.43\). This means that about 43% of the houses that have a greater than the median sale price are in the top quarter of above ground living area.
\(P(X>x,Y>y)\approx 0.22\). This means that about 22% of the houses both have a sale price greater than the median and are in the top quarter of above ground living area.
\(P(X<x|Y>y)\approx0.57\). This means that about 57% of the houses that have a greater than the median sale price are not in the top quarter of above ground living area.
Note that \(P(X>x|Y>y)+P(X<x|Y>y)=1\), which makes sense because this accounts for all circumstances where \(Y>y\) except for the case where \(X=x\). However, there are no cases where \(X=x\), as shown below. Therefore, \(P(X>x | Y>y)+P(X<x | Y>y)\) accounts for all cases where \(Y>y\).
#Count how many houses have a living area exactly equal to the 3rd quartile cutoff
xequalsx <- sum(x == probs_df$X)
xequalsx
## [1] 0
#Flag the rows above each cutoff once so that every cell of the table reuses the same masks
x_above <- probs_df$X > x
y_above <- probs_df$Y > y
#Build the table column by column: Y at or below its median, Y above its median, then the row totals
col_1 <- c(sum(!x_above & !y_above), sum(x_above & !y_above), sum(!y_above))
col_2 <- c(sum(!x_above & y_above), sum(x_above & y_above), sum(y_above))
col_3 <- c(sum(!x_above), sum(x_above), nrow(probs_df))
#Assemble the data frame, then label the columns (Y categories) and rows (X categories)
counts_df <- data.frame(col_1, col_2, col_3)
names(counts_df) <- c("<=2d quartile", ">2d quartile", "Total")
row.names(counts_df) <- c("<= 3d quartile", ">3d quartile", "Total")
kable(counts_df, align = "ccc")
| | <=2d quartile | >2d quartile | Total |
|---|---|---|---|
| <= 3d quartile | 682 | 413 | 1095 |
| >3d quartile | 50 | 315 | 365 |
| Total | 732 | 728 | 1460 |
The values in this data frame are consistent with the ones previously calculated. The overall total of 1,460 is the same as the number of rows in the original data frame (demonstrated below). Additionally, all three probabilities calculated above are consistent with this data. Probability a (\(P(X>x|Y>y)\)) is \(315/728\approx0.43\), probability b (\(P(X>x,Y>y)\)) is \(315/1460\approx0.22\), and probability c (\(P(X<x|Y>y)\)) is \(413/728\approx0.57\), as previously calculated.
#Confirm that the original data frame has 1460 rows
nrow(training_data)
## [1] 1460
#Calculating Probabilities
#A is the event X > x (above the 3rd quartile of X); B is the event Y > y (above the median of Y)
n_obs <- nrow(probs_df)
p_A <- sum(probs_df$X > x) / n_obs
p_B <- sum(probs_df$Y > y) / n_obs
p_A_and_B <- sum(probs_df$X > x & probs_df$Y > y) / n_obs
p_A_given_B <- p_A_and_B / p_B
p_A_times_p_B <- p_A * p_B
#Comparing probabilities
#Independence requires P(A|B) = P(A), or equivalently P(A and B) = P(A)P(B),
#so each pair below is printed together for a like-for-like comparison
p_A_given_B
## [1] 0.4326923
p_A
## [1] 0.25
p_A_and_B
## [1] 0.2157534
p_A_times_p_B
## [1] 0.1246575
\(P(A|B)\approx0.43\neq P(A)=365/1460=0.25\) (equivalently, \(P(A\cap B)\approx0.22\neq P(A)P(B)\approx0.12\)), which means these variables are not independent.
#Drop the total row and total column so only the four interior cells go into the chi-square test
chisq_data <- counts_df[-3, -3]
#Label the rows by event A (X > x) and the columns by event B (Y > y)
dimnames(chisq_data) <- list(c("~A", "A"), c("~B", "B"))
#Run the chi-square test on the 2x2 table
chisq <- chisq.test(chisq_data)
#Show the table that was tested
kable(chisq_data, align = "cc")
| | ~B | B |
|---|---|---|
| ~A | 682 | 413 |
| A | 50 | 315 |
#Print the chi-square statistic, degrees of freedom, and p-value
chisq
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: chisq_data
## X-squared = 256.53, df = 1, p-value < 2.2e-16
The chi-square value is 256.53 on 1 degree of freedom. If A and B were independent, the probability of getting a chi-square value at least this great would be less than .00000000000000022, far less than the standard threshold of 0.05. We can therefore confidently conclude that A and B are not independent.
The univariate descriptive statistics, histogram, and density plot for X are shown below.
#Descriptive statistics for X (GrLivArea)
summary(training_data[["GrLivArea"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 334 1130 1464 1515 1777 5642
#Histogram of X using the default binning
ggplot(training_data, aes(x = GrLivArea)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#Density plot of X
ggplot(training_data, aes(x = GrLivArea)) +
  geom_density()
The univariate descriptive statistics, histogram, and density plot for Y are shown below.
#Descriptive statistics for Y (SalePrice)
summary(training_data[["SalePrice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 34900 129975 163000 180921 214000 755000
#Histogram of Y using the default binning
ggplot(training_data, aes(x = SalePrice)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#Density plot of Y
ggplot(training_data, aes(x = SalePrice)) +
  geom_density()
The scatterplot of X and Y is shown below.
#Scatterplot of sale price against above ground living area
training_data %>%
  ggplot(aes(x = GrLivArea, y = SalePrice)) +
  geom_point()
#Creating a data frame with the data split by whether X > x
#Columns are named when the data frame is built instead of being renamed by position afterwards
#Houses above the 3rd quartile of GrLivArea are labelled "A"; all others are labelled "~A"
#TRUE is spelled out because the shorthand T is an ordinary variable that can be reassigned
ttest_data <- data.frame(
  GrLivArea = training_data$GrLivArea,
  SalePrice = training_data$SalePrice
) %>%
  mutate(category = case_when(GrLivArea > x ~ "A", TRUE ~ "~A"))
#Performing a t-test on the difference of means in sale price for X > x and X <= x
ttest <- t.test(SalePrice ~ category, data = ttest_data, conf.level = 0.95)
#Printing the results
ttest
##
## Welch Two Sample t-test
##
## data: SalePrice by category
## t = -19.111, df = 433.89, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group ~A and group A is not equal to 0
## 95 percent confidence interval:
## -112392.64 -91430.31
## sample estimates:
## mean in group ~A mean in group A
## 155443.3 257354.8
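We can be 95% confident that the true difference between the mean sale price for houses at or below the third quartile for GrLivArea and the mean sale price for houses above the third quartile for GrLivArea is between -$112,392.64 and -$91,430.31. In other words, houses above the third quartile sell for roughly $91,430 to $112,393 more on average. (The values $155,443.30 and $257,354.80 in the output are the two group means, not the endpoints of the interval.)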
#Gather total basement square footage, above ground square footage, and sale price for the correlation matrix
cor_data <- training_data[, c("TotalBsmtSF", "GrLivArea", "SalePrice")]
#Count the missing values across all three columns
sum(is.na(cor_data))
## [1] 0
#Compute the pairwise correlations, round them to two decimal places, and print the matrix
cor_matrix <- round(cor(cor_data), digits = 2)
print(cor_matrix)
## TotalBsmtSF GrLivArea SalePrice
## TotalBsmtSF 1.00 0.45 0.61
## GrLivArea 0.45 1.00 0.71
## SalePrice 0.61 0.71 1.00
#Testing the hypothesis that the correlation is 0 for each combination of variables and printing the results
#Each test requests a 99% confidence interval for the correlation
#Pair 1: total basement area vs. above ground living area
correlation_1 <- cor.test(cor_data$TotalBsmtSF, cor_data$GrLivArea, conf.level = 0.99)
print(correlation_1)
##
## Pearson's product-moment correlation
##
## data: cor_data$TotalBsmtSF and cor_data$GrLivArea
## t = 19.503, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 99 percent confidence interval:
## 0.3997401 0.5067175
## sample estimates:
## cor
## 0.4548682
#Pair 2: total basement area vs. sale price, with a 99% confidence interval
correlation_2 <- cor.test(cor_data$TotalBsmtSF, cor_data$SalePrice, conf.level = 0.99)
print(correlation_2)
##
## Pearson's product-moment correlation
##
## data: cor_data$TotalBsmtSF and cor_data$SalePrice
## t = 29.671, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 99 percent confidence interval:
## 0.5697562 0.6539251
## sample estimates:
## cor
## 0.6135806
#Pair 3: above ground living area vs. sale price, with a 99% confidence interval
correlation_3 <- cor.test(cor_data$GrLivArea, cor_data$SalePrice, conf.level = 0.99)
print(correlation_3)
##
## Pearson's product-moment correlation
##
## data: cor_data$GrLivArea and cor_data$SalePrice
## t = 38.348, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 99 percent confidence interval:
## 0.6733974 0.7406408
## sample estimates:
## cor
## 0.7086245
The 99% confidence interval for the correlation coefficient between total basement area and above ground living area is 0.400 to 0.507, with a p-value less than 0.001. This means that we can be 99% confident that the true correlation between these variables is between 0.400 and 0.507 (intervals constructed this way capture the true correlation 99% of the time). This interval does not include 0, and the p-value means that the probability of obtaining an r-value at least this extreme if the true correlation was 0 is less than 1 in 1000. We can safely reject the null hypothesis (that the correlation between these variables is 0).
The 99% confidence interval for the correlation coefficient between total basement area and sale price is 0.570 to 0.654, with a p-value less than 0.001. This means that we can be 99% confident that the true correlation between these variables is between 0.570 and 0.654 (intervals constructed this way capture the true correlation 99% of the time). This interval does not include 0, and the p-value means that the probability of obtaining an r-value at least this extreme if the true correlation was 0 is less than 1 in 1000. We can safely reject the null hypothesis (that the correlation between these variables is 0).
The 99% confidence interval for the correlation coefficient between above ground living area and sale price is 0.673 to 0.741, with a p-value less than 0.001. This means that we can be 99% confident that the true correlation between these variables is between 0.673 and 0.741 (intervals constructed this way capture the true correlation 99% of the time). This interval does not include 0, and the p-value means that the probability of obtaining an r-value at least this extreme if the true correlation was 0 is less than 1 in 1000. We can safely reject the null hypothesis (that the correlation between these variables is 0).
#Calculating and printing the precision matrix
#solve() called with a single matrix argument returns the inverse of that matrix
#NOTE(review): cor_matrix was rounded to 2 decimal places when it was created, so this inverts the rounded correlations - confirm that is intended
precision_matrix <- solve(cor_matrix)
print(precision_matrix)
## TotalBsmtSF GrLivArea SalePrice
## TotalBsmtSF 1.59407245 -0.05432511 -0.9338134
## GrLivArea -0.05432511 2.01838696 -1.3999164
## SalePrice -0.93381337 -1.39991642 2.5635668
#Multiply the correlation matrix by the precision matrix (rounded to 2 decimal places for display)
round(cor_matrix %*% precision_matrix,2)
## TotalBsmtSF GrLivArea SalePrice
## TotalBsmtSF 1 0 0
## GrLivArea 0 1 0
## SalePrice 0 0 1
#Multiply in the opposite order to confirm the product is the same
round(precision_matrix %*% cor_matrix,2)
## TotalBsmtSF GrLivArea SalePrice
## TotalBsmtSF 1 0 0
## GrLivArea 0 1 0
## SalePrice 0 0 1
Since the precision matrix is the inverse of the correlation matrix, the product of the matrices is the identity matrix.
I will conduct principal component analysis on a subset of 5 of the numerical variables in the training data set: Lot Area, Total Basement Area, Above Ground Living Area, Garage Area, and Sale Price
#Preparing the data
#Five numeric columns: lot area, basement area, above ground living area, garage area, and sale price
numeric_data <- data.frame(
  LotArea = training_data$LotArea,
  TotalBsmtSF = training_data$TotalBsmtSF,
  GrLivArea = training_data$GrLivArea,
  GarageArea = training_data$GarageArea,
  SalePrice = training_data$SalePrice
)
#Checking that there are no missing values in the data
sum(is.na(numeric_data))
## [1] 0
#Perform principal component analysis on centered, unit-variance columns
#prcomp() names its scaling argument `scale.` (with a trailing dot); `scale` only worked through partial argument matching
pc_analysis <- prcomp(numeric_data, center = TRUE, scale. = TRUE)
print(pc_analysis)
## Standard deviations (1, .., p=5):
## [1] 1.6786596 0.9387993 0.7493938 0.7083844 0.4871943
##
## Rotation (n x k) = (5 x 5):
## PC1 PC2 PC3 PC4 PC5
## LotArea 0.2579808 -0.9520939 -0.0378444 0.15881280 0.01759264
## TotalBsmtSF 0.4592324 0.0442588 -0.6120449 -0.60157245 -0.22507431
## GrLivArea 0.4783276 0.0646521 0.7072417 -0.16672514 -0.48891169
## GarageArea 0.4554486 0.2586563 -0.3067066 0.76253406 -0.22391155
## SalePrice 0.5350895 0.1430925 0.1723632 -0.06027949 0.81231830
The first principal component increases with TotalBsmtSF, GrLivArea, GarageArea, and SalePrice. This component is a measure of the size (including basement area, above ground area, and garage area) and price of a house, which tend to vary together (this makes sense since houses that are large and have big garages and basements are likely to be more expensive than those that have a more modest size).
The second principal component decreases with LotArea, and can be viewed as a measure of how small the property is.
The third principal component increases with an increase in GrLivArea and a decrease in TotalBsmtSF (and, to a lesser extent, GarageArea). This could be interpreted as a measure of amount of the property dedicated to living space as opposed to storage (garage and basement).
The fourth principal component increases with an increase in GarageArea and a decrease in TotalBsmtSF. This could be interpreted as a measure of the share of non-living space that is in the house’s garage, as opposed to the basement (these will often be inversely related, since a garage often occupies the part of an underground space that the basement would fill).
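The fifth principal component increases with SalePrice and decreases with GrLivArea (and, to a lesser extent, TotalBsmtSF and GarageArea). It contrasts price against size, so it is high for houses that are expensive relative to their square footage (perhaps because space is more of a premium in expensive areas). This component has the smallest standard deviation (about 0.49), so it explains the least variance, and it should not be read as evidence that expensive houses tend to have smaller living spaces: SalePrice and GrLivArea are positively correlated (r ≈ 0.71).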
#Demonstrating that the minimum value of X is already greater than 0
min(X)
## [1] 334
#Fitting an exponential PDF to X
#fitdistr() is called through the MASS namespace instead of attaching the package,
#because library(MASS) masks dplyr::select() for the rest of the script
MASS::fitdistr(X, "exponential")
## rate
## 6.598640e-04
## (1.726943e-05)
The optimal value of \(\lambda\) for this distribution is approximately 0.00066.
#Draw 1000 random values from an exponential distribution with rate 0.00066
#NOTE(review): no seed is set before sampling, so every run produces a different sample
samples <- rexp(n = 1000, rate = 0.00066)
#Histogram of the simulated values, followed by a histogram of the observed X, both with 100-unit bins
ggplot() +
  aes(x = samples) +
  geom_histogram(binwidth = 100)
ggplot() +
  aes(x = X) +
  geom_histogram(binwidth = 100)
The histograms have some very important differences. The “samples”
distribution reaches its peak and starts decreasing much earlier than
the “X” distribution (“samples” peaks just above 0, while “X” peaks just
shy of 2000). This results in somewhat different shapes for the
histograms, with the “samples” distribution starting high and then
falling, while the “X” distribution starts at 0, rises to a peak, and
then falls. The “samples” distribution generated a much higher maximum,
close to 9,000, than the “X” distribution (which maxes out at around
5,500). In general, the low “samples” data is lower and the high
“samples” data is higher than the “X” data, which is more compact or
“squashed”.
#5th and 95th percentiles of the exponential distribution with rate 0.00066, from its quantile function
qexp(p = c(0.05, 0.95), rate = 0.00066)
## [1] 77.71711 4538.98829
#Empirical 5th and 95th percentiles of the observed data
quantile(X, probs = c(0.05, 0.95))
## 5% 95%
## 848.0 2466.1
These results are consistent with what we observed in the histograms: the exponential distribution that the “samples” data was drawn from is more spread out, with a lower 5th percentile and a larger 95th percentile than the more compact “X” data, which has a greater 5th percentile and a smaller 95th percentile.
#Extract the 95% confidence interval for the mean of X from a one-sample t-test
t.test(X, conf.level = 0.95)[["conf.int"]]
## [1] 1488.487 1542.440
## attr(,"conf.level")
## [1] 0.95
As opposed to the previous calculations, which provided the 5th and 95th percentile values for the fitted exponential distribution and the empirical data, this calculation gives a 95% confidence interval for the mean: we can be 95% confident that the population mean for X is between 1488 and 1542.
#Replacing NA values with likely replacements
#The fill value for GarageYrBlt is the rounded mean of the non-missing garage years
garage_year_fill <- round(mean(training_data$GarageYrBlt, na.rm = TRUE))
#One replace_na() call with a named list fills every listed column with its replacement value
training_data <- training_data %>%
  replace_na(list(
    LotFrontage = 0,
    Alley = "None",
    MasVnrType = "None",
    MasVnrArea = 0,
    BsmtQual = "Nobase",
    BsmtCond = "Nobase",
    BsmtExposure = "Nobase",
    BsmtFinType1 = "None",
    BsmtFinType2 = "None",
    Electrical = "SBrkr",
    FireplaceQu = "None",
    GarageType = "None",
    GarageYrBlt = garage_year_fill,
    GarageFinish = "None",
    GarageQual = "None",
    GarageCond = "None",
    PoolQC = "None",
    Fence = "None",
    MiscFeature = "None"
  ))
#Fit a linear model of SalePrice on every column except the ones subtracted in the formula
regression_model <- lm(SalePrice ~ . - Id - Alley - FireplaceQu - LotFrontage -
  Exterior1st - Exterior2nd - MasVnrType - Foundation - HouseStyle -
  OpenPorchSF - MiscFeature - MiscVal - PavedDrive - LotShape -
  EnclosedPorch - ExterCond - CentralAir - GarageYrBlt - GarageFinish -
  YrSold - BsmtHalfBath - HalfBath - Electrical - BsmtFullBath -
  LowQualFinSF - X3SsnPorch - GrLivArea - BsmtFinType2 - GarageType -
  BldgType - WoodDeckSF - FullBath - Utilities - YearRemodAdd -
  MoSold - LandContour - BsmtCond - RoofStyle - LotConfig,
  data = training_data)
#Print the coefficient table and fit statistics
summary(regression_model)
##
## Call:
## lm(formula = SalePrice ~ . - Id - Alley - FireplaceQu - LotFrontage -
## Exterior1st - Exterior2nd - MasVnrType - Foundation - HouseStyle -
## OpenPorchSF - MiscFeature - MiscVal - PavedDrive - LotShape -
## EnclosedPorch - ExterCond - CentralAir - GarageYrBlt - GarageFinish -
## YrSold - BsmtHalfBath - HalfBath - Electrical - BsmtFullBath -
## LowQualFinSF - X3SsnPorch - GrLivArea - BsmtFinType2 - GarageType -
## BldgType - WoodDeckSF - FullBath - Utilities - YearRemodAdd -
## MoSold - LandContour - BsmtCond - RoofStyle - LotConfig,
## data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -185086 -9490 400 9248 185086
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.526e+06 1.439e+05 -10.605 < 2e-16 ***
## MSSubClass -1.524e+02 2.278e+01 -6.690 3.29e-11 ***
## MSZoningFV 3.282e+04 1.128e+04 2.909 0.003681 **
## MSZoningRH 2.600e+04 1.132e+04 2.298 0.021728 *
## MSZoningRL 2.830e+04 9.561e+03 2.959 0.003137 **
## MSZoningRM 2.120e+04 9.014e+03 2.351 0.018847 *
## LotArea 6.686e-01 9.058e-02 7.382 2.75e-13 ***
## StreetPave 3.588e+04 1.113e+04 3.224 0.001295 **
## LandSlopeMod 4.058e+03 3.311e+03 1.226 0.220517
## LandSlopeSev -2.926e+04 9.551e+03 -3.064 0.002229 **
## NeighborhoodBlueste 1.467e+04 1.838e+04 0.798 0.424760
## NeighborhoodBrDale 2.629e+03 9.561e+03 0.275 0.783379
## NeighborhoodBrkSide 6.997e+03 8.150e+03 0.859 0.390749
## NeighborhoodClearCr -6.183e+03 8.218e+03 -0.752 0.451937
## NeighborhoodCollgCr -1.263e+03 6.530e+03 -0.193 0.846688
## NeighborhoodCrawfor 1.995e+04 7.576e+03 2.633 0.008555 **
## NeighborhoodEdwards -1.023e+04 7.116e+03 -1.437 0.150959
## NeighborhoodGilbert -1.730e+03 6.807e+03 -0.254 0.799394
## NeighborhoodIDOTRR 5.260e+03 9.417e+03 0.559 0.576551
## NeighborhoodMeadowV 1.350e+03 9.326e+03 0.145 0.884894
## NeighborhoodMitchel -1.397e+04 7.310e+03 -1.911 0.056178 .
## NeighborhoodNAmes -7.362e+03 6.912e+03 -1.065 0.286972
## NeighborhoodNoRidge 3.562e+04 7.591e+03 4.692 2.98e-06 ***
## NeighborhoodNPkVill 6.093e+03 1.003e+04 0.607 0.543857
## NeighborhoodNridgHt 2.117e+04 6.738e+03 3.143 0.001712 **
## NeighborhoodNWAmes -1.026e+04 7.159e+03 -1.433 0.152004
## NeighborhoodOldTown 7.136e+02 8.318e+03 0.086 0.931640
## NeighborhoodSawyer -2.751e+03 7.283e+03 -0.378 0.705721
## NeighborhoodSawyerW 1.143e+03 7.012e+03 0.163 0.870567
## NeighborhoodSomerst 7.789e+03 8.138e+03 0.957 0.338669
## NeighborhoodStoneBr 4.137e+04 7.600e+03 5.443 6.24e-08 ***
## NeighborhoodSWISU 4.256e+02 8.653e+03 0.049 0.960775
## NeighborhoodTimber -5.278e+03 7.356e+03 -0.718 0.473137
## NeighborhoodVeenker 5.897e+03 9.602e+03 0.614 0.539231
## Condition1Feedr 6.018e+03 4.718e+03 1.276 0.202297
## Condition1Norm 1.517e+04 3.865e+03 3.926 9.06e-05 ***
## Condition1PosA 9.579e+03 9.460e+03 1.013 0.311445
## Condition1PosN 1.509e+04 6.972e+03 2.164 0.030662 *
## Condition1RRAe -8.965e+03 8.743e+03 -1.025 0.305391
## Condition1RRAn 1.479e+04 6.485e+03 2.281 0.022725 *
## Condition1RRNe 5.183e+03 1.706e+04 0.304 0.761273
## Condition1RRNn 1.014e+04 1.193e+04 0.850 0.395556
## Condition2Feedr -2.165e+04 2.119e+04 -1.022 0.307026
## Condition2Norm -1.648e+04 1.820e+04 -0.906 0.365347
## Condition2PosA 3.013e+04 3.011e+04 1.001 0.317095
## Condition2PosN -2.501e+05 2.584e+04 -9.678 < 2e-16 ***
## Condition2RRAe -5.264e+04 2.950e+04 -1.784 0.074651 .
## Condition2RRAn -2.293e+04 2.972e+04 -0.771 0.440586
## Condition2RRNn -7.380e+03 2.519e+04 -0.293 0.769572
## OverallQual 6.623e+03 9.434e+02 7.021 3.52e-12 ***
## OverallCond 6.153e+03 7.167e+02 8.585 < 2e-16 ***
## YearBuilt 3.486e+02 5.737e+01 6.077 1.60e-09 ***
## RoofMatlCompShg 5.958e+05 4.321e+04 13.787 < 2e-16 ***
## RoofMatlMembran 6.564e+05 5.110e+04 12.845 < 2e-16 ***
## RoofMatlMetal 6.304e+05 5.055e+04 12.472 < 2e-16 ***
## RoofMatlRoll 5.948e+05 4.961e+04 11.989 < 2e-16 ***
## RoofMatlTar&Grv 5.817e+05 4.435e+04 13.117 < 2e-16 ***
## RoofMatlWdShake 6.043e+05 4.469e+04 13.524 < 2e-16 ***
## RoofMatlWdShngl 6.463e+05 4.397e+04 14.698 < 2e-16 ***
## MasVnrArea 1.251e+01 4.389e+00 2.851 0.004420 **
## ExterQualFa -1.293e+04 9.227e+03 -1.401 0.161388
## ExterQualGd -2.434e+04 4.571e+03 -5.323 1.19e-07 ***
## ExterQualTA -2.419e+04 5.067e+03 -4.774 2.01e-06 ***
## BsmtQualFa -1.529e+04 5.892e+03 -2.594 0.009585 **
## BsmtQualGd -2.104e+04 3.167e+03 -6.642 4.52e-11 ***
## BsmtQualNobase 6.655e+03 2.422e+04 0.275 0.783481
## BsmtQualTA -1.907e+04 3.880e+03 -4.915 9.96e-07 ***
## BsmtExposureGd 1.599e+04 2.887e+03 5.537 3.70e-08 ***
## BsmtExposureMn -2.452e+03 2.846e+03 -0.862 0.389022
## BsmtExposureNo -6.323e+03 1.982e+03 -3.190 0.001456 **
## BsmtExposureNobase -1.391e+04 2.306e+04 -0.603 0.546474
## BsmtFinType1BLQ 2.446e+03 2.592e+03 0.944 0.345536
## BsmtFinType1GLQ 4.805e+03 2.380e+03 2.019 0.043693 *
## BsmtFinType1LwQ -2.336e+03 3.404e+03 -0.686 0.492637
## BsmtFinType1None NA NA NA NA
## BsmtFinType1Rec -1.040e+03 2.731e+03 -0.381 0.703533
## BsmtFinType1Unf 2.715e+03 2.744e+03 0.989 0.322705
## BsmtFinSF1 3.847e+01 4.433e+00 8.678 < 2e-16 ***
## BsmtFinSF2 3.043e+01 5.725e+00 5.316 1.25e-07 ***
## BsmtUnfSF 2.030e+01 4.171e+00 4.867 1.27e-06 ***
## TotalBsmtSF NA NA NA NA
## HeatingGasA 4.012e+03 2.433e+04 0.165 0.869038
## HeatingGasW 4.584e+03 2.506e+04 0.183 0.854913
## HeatingGrav -1.178e+03 2.656e+04 -0.044 0.964643
## HeatingOthW -2.440e+04 2.993e+04 -0.815 0.415081
## HeatingWall 1.938e+04 2.699e+04 0.718 0.472998
## HeatingQCFa -2.129e+03 4.345e+03 -0.490 0.624201
## HeatingQCGd -4.761e+03 1.972e+03 -2.415 0.015873 *
## HeatingQCPo 5.059e+02 2.490e+04 0.020 0.983792
## HeatingQCTA -4.960e+03 1.906e+03 -2.602 0.009359 **
## X1stFlrSF 4.955e+01 4.696e+00 10.552 < 2e-16 ***
## X2ndFlrSF 5.936e+01 3.234e+00 18.354 < 2e-16 ***
## BedroomAbvGr -4.424e+03 1.253e+03 -3.532 0.000427 ***
## KitchenAbvGr -1.430e+04 4.027e+03 -3.550 0.000398 ***
## KitchenQualFa -2.163e+04 5.762e+03 -3.754 0.000182 ***
## KitchenQualGd -2.388e+04 3.330e+03 -7.172 1.23e-12 ***
## KitchenQualTA -2.403e+04 3.728e+03 -6.446 1.61e-10 ***
## TotRmsAbvGrd 2.134e+03 8.892e+02 2.400 0.016536 *
## FunctionalMaj2 -4.248e+03 1.295e+04 -0.328 0.742983
## FunctionalMin1 2.225e+03 7.950e+03 0.280 0.779646
## FunctionalMin2 5.269e+03 7.810e+03 0.675 0.500030
## FunctionalMod -2.095e+03 9.287e+03 -0.226 0.821544
## FunctionalSev -4.642e+04 2.591e+04 -1.791 0.073473 .
## FunctionalTyp 1.433e+04 6.826e+03 2.099 0.035993 *
## Fireplaces 2.626e+03 1.262e+03 2.081 0.037671 *
## GarageCars 5.053e+03 2.179e+03 2.319 0.020573 *
## GarageArea 1.577e+01 7.195e+00 2.191 0.028604 *
## GarageQualFa -1.120e+05 2.706e+04 -4.140 3.69e-05 ***
## GarageQualGd -1.044e+05 2.782e+04 -3.751 0.000184 ***
## GarageQualNone 5.659e+03 1.707e+04 0.332 0.740316
## GarageQualPo -1.109e+05 3.305e+04 -3.356 0.000814 ***
## GarageQualTA -1.090e+05 2.682e+04 -4.065 5.08e-05 ***
## GarageCondFa 1.009e+05 3.199e+04 3.154 0.001647 **
## GarageCondGd 9.618e+04 3.290e+04 2.924 0.003517 **
## GarageCondNone NA NA NA NA
## GarageCondPo 1.038e+05 3.438e+04 3.018 0.002590 **
## GarageCondTA 1.044e+05 3.162e+04 3.301 0.000989 ***
## ScreenPorch 3.173e+01 1.183e+01 2.683 0.007397 **
## PoolArea 6.082e+02 1.659e+02 3.666 0.000256 ***
## PoolQCFa -1.502e+05 2.587e+04 -5.806 7.98e-09 ***
## PoolQCGd -1.122e+05 3.090e+04 -3.630 0.000294 ***
## PoolQCNone 2.097e+05 9.010e+04 2.327 0.020090 *
## FenceGdWo 8.505e+03 4.656e+03 1.827 0.067964 .
## FenceMnPrv 9.054e+03 3.791e+03 2.388 0.017058 *
## FenceMnWw 5.659e+02 7.857e+03 0.072 0.942591
## FenceNone 7.331e+03 3.445e+03 2.128 0.033508 *
## SaleTypeCon 3.830e+04 1.731e+04 2.213 0.027093 *
## SaleTypeConLD 1.868e+04 9.235e+03 2.023 0.043305 *
## SaleTypeConLI 4.404e+03 1.121e+04 0.393 0.694619
## SaleTypeConLw 5.172e+03 1.153e+04 0.449 0.653801
## SaleTypeCWD 1.727e+04 1.255e+04 1.376 0.168977
## SaleTypeNew 3.632e+04 1.496e+04 2.428 0.015298 *
## SaleTypeOth 1.340e+04 1.415e+04 0.947 0.343639
## SaleTypeWD 2.153e+03 3.993e+03 0.539 0.589926
## SaleConditionAdjLand 1.554e+04 1.243e+04 1.250 0.211569
## SaleConditionAlloca 1.279e+03 8.078e+03 0.158 0.874240
## SaleConditionFamily -2.514e+01 5.957e+03 -0.004 0.996634
## SaleConditionNormal 8.309e+03 2.761e+03 3.009 0.002670 **
## SaleConditionPartial -1.024e+04 1.447e+04 -0.708 0.478987
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22860 on 1324 degrees of freedom
## Multiple R-squared: 0.9249, Adjusted R-squared: 0.9172
## F-statistic: 120.8 on 135 and 1324 DF, p-value: < 2.2e-16
#Import the test data
test_url <- "https://raw.githubusercontent.com/Marley-Myrianthopoulos/grad_school_data/main/data_605_final_test.csv"
test_data <- read.csv(test_url)
#Replace NA Values
#The GarageYrBlt fill value is the rounded mean of the training data's GarageYrBlt column
garage_year_fill_test <- round(mean(training_data$GarageYrBlt, na.rm = TRUE))
#One replace_na() call with a named list fills every listed column with its replacement value
test_data <- test_data %>%
  replace_na(list(
    LotFrontage = 0,
    Alley = "None",
    MasVnrType = "None",
    MasVnrArea = 0,
    BsmtQual = "Nobase",
    BsmtCond = "Nobase",
    BsmtExposure = "Nobase",
    BsmtFinType1 = "None",
    BsmtFinType2 = "None",
    Electrical = "SBrkr",
    FireplaceQu = "None",
    GarageType = "None",
    GarageYrBlt = garage_year_fill_test,
    GarageFinish = "None",
    GarageQual = "None",
    GarageCond = "None",
    PoolQC = "None",
    Fence = "None",
    MiscFeature = "None",
    MSZoning = "RL",
    Utilities = "AllPub",
    Exterior1st = "VinylSd",
    Exterior2nd = "VinylSd",
    BsmtFinSF1 = 0,
    BsmtFinSF2 = 0,
    BsmtUnfSF = 0,
    TotalBsmtSF = 0,
    BsmtFullBath = 0,
    BsmtHalfBath = 0,
    KitchenQual = "TA",
    Functional = "Typ",
    GarageCars = 0,
    GarageArea = 0,
    SaleType = "WD"
  ))
#Confirm that NA values are resolved in the test data
sum(is.na(test_data))
## [1] 0
#Use the regression equation to predict sale prices
price_estimates <- predict(regression_model, test_data)
## Warning in predict.lm(regression_model, test_data): prediction from
## rank-deficient fit; attr(*, "non-estim") has doubtful cases
#Pair each test Id with its predicted sale price
predictions_df <- data.frame(Id = test_data$Id, SalePrice = price_estimates)
#Get working directory to create the .csv file in
#file.path() inserts the path separator; pasting ".myrianthopoulosfinal.csv" directly onto getwd()
#produced a file beside the working directory instead of inside it
wd <- getwd()
file_path <- file.path(wd, "myrianthopoulosfinal.csv")
write.csv(predictions_df, file_path, row.names = FALSE)
I submitted these predictions to Kaggle under the username MarleyMyr, resulting in a score of 0.17182.