House Prices - Advanced Regression Techniques https://www.kaggle.com/c/house-prices-advanced-regression-techniques
# Needed libraries
library(ggplot2)
library(moments)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(matrixcalc)
library(FactoMineR)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
1. Pick one of the quantitative independent variables from the training data set (train.csv), and define that variable as X. Make sure this variable is skewed to the right! Pick the dependent variable and define it as Y.
# Read train.csv
train_data <- read.csv("https://raw.githubusercontent.com/RonBalaban/CUNY-SPS-R/main/train.csv")
# Sample
head(train_data)
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour
## 1 1 60 RL 65 8450 Pave <NA> Reg Lvl
## 2 2 20 RL 80 9600 Pave <NA> Reg Lvl
## 3 3 60 RL 68 11250 Pave <NA> IR1 Lvl
## 4 4 70 RL 60 9550 Pave <NA> IR1 Lvl
## 5 5 60 RL 84 14260 Pave <NA> IR1 Lvl
## 6 6 50 RL 85 14115 Pave <NA> IR1 Lvl
## Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType
## 1 AllPub Inside Gtl CollgCr Norm Norm 1Fam
## 2 AllPub FR2 Gtl Veenker Feedr Norm 1Fam
## 3 AllPub Inside Gtl CollgCr Norm Norm 1Fam
## 4 AllPub Corner Gtl Crawfor Norm Norm 1Fam
## 5 AllPub FR2 Gtl NoRidge Norm Norm 1Fam
## 6 AllPub Inside Gtl Mitchel Norm Norm 1Fam
## HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl
## 1 2Story 7 5 2003 2003 Gable CompShg
## 2 1Story 6 8 1976 1976 Gable CompShg
## 3 2Story 7 5 2001 2002 Gable CompShg
## 4 2Story 7 5 1915 1970 Gable CompShg
## 5 2Story 8 5 2000 2000 Gable CompShg
## 6 1.5Fin 5 5 1993 1995 Gable CompShg
## Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation
## 1 VinylSd VinylSd BrkFace 196 Gd TA PConc
## 2 MetalSd MetalSd None 0 TA TA CBlock
## 3 VinylSd VinylSd BrkFace 162 Gd TA PConc
## 4 Wd Sdng Wd Shng None 0 TA TA BrkTil
## 5 VinylSd VinylSd BrkFace 350 Gd TA PConc
## 6 VinylSd VinylSd None 0 TA TA Wood
## BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## 1 Gd TA No GLQ 706 Unf
## 2 Gd TA Gd ALQ 978 Unf
## 3 Gd TA Mn GLQ 486 Unf
## 4 TA Gd No ALQ 216 Unf
## 5 Gd TA Av GLQ 655 Unf
## 6 Gd TA No GLQ 732 Unf
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical
## 1 0 150 856 GasA Ex Y SBrkr
## 2 0 284 1262 GasA Ex Y SBrkr
## 3 0 434 920 GasA Ex Y SBrkr
## 4 0 540 756 GasA Gd Y SBrkr
## 5 0 490 1145 GasA Ex Y SBrkr
## 6 0 64 796 GasA Ex Y SBrkr
## X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## 1 856 854 0 1710 1 0 2
## 2 1262 0 0 1262 0 1 2
## 3 920 866 0 1786 1 0 2
## 4 961 756 0 1717 1 0 1
## 5 1145 1053 0 2198 1 0 2
## 6 796 566 0 1362 1 0 1
## HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## 1 1 3 1 Gd 8 Typ
## 2 0 3 1 TA 6 Typ
## 3 1 3 1 Gd 6 Typ
## 4 0 3 1 Gd 7 Typ
## 5 1 4 1 Gd 9 Typ
## 6 1 1 1 TA 5 Typ
## Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars
## 1 0 <NA> Attchd 2003 RFn 2
## 2 1 TA Attchd 1976 RFn 2
## 3 1 TA Attchd 2001 RFn 2
## 4 1 Gd Detchd 1998 Unf 3
## 5 1 TA Attchd 2000 RFn 3
## 6 0 <NA> Attchd 1993 Unf 2
## GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF
## 1 548 TA TA Y 0 61
## 2 460 TA TA Y 298 0
## 3 608 TA TA Y 0 42
## 4 642 TA TA Y 0 35
## 5 836 TA TA Y 192 84
## 6 480 TA TA Y 40 30
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature
## 1 0 0 0 0 <NA> <NA> <NA>
## 2 0 0 0 0 <NA> <NA> <NA>
## 3 0 0 0 0 <NA> <NA> <NA>
## 4 272 0 0 0 <NA> <NA> <NA>
## 5 0 0 0 0 <NA> <NA> <NA>
## 6 0 320 0 0 <NA> MnPrv Shed
## MiscVal MoSold YrSold SaleType SaleCondition SalePrice
## 1 0 2 2008 WD Normal 208500
## 2 0 5 2007 WD Normal 181500
## 3 0 9 2008 WD Normal 223500
## 4 0 2 2006 WD Abnorml 140000
## 5 0 12 2008 WD Normal 250000
## 6 700 10 2009 WD Normal 143000
# Summary
summary(train_data)
## Id MSSubClass MSZoning LotFrontage
## Min. : 1.0 Min. : 20.0 Length:1460 Min. : 21.00
## 1st Qu.: 365.8 1st Qu.: 20.0 Class :character 1st Qu.: 59.00
## Median : 730.5 Median : 50.0 Mode :character Median : 69.00
## Mean : 730.5 Mean : 56.9 Mean : 70.05
## 3rd Qu.:1095.2 3rd Qu.: 70.0 3rd Qu.: 80.00
## Max. :1460.0 Max. :190.0 Max. :313.00
## NA's :259
## LotArea Street Alley LotShape
## Min. : 1300 Length:1460 Length:1460 Length:1460
## 1st Qu.: 7554 Class :character Class :character Class :character
## Median : 9478 Mode :character Mode :character Mode :character
## Mean : 10517
## 3rd Qu.: 11602
## Max. :215245
##
## LandContour Utilities LotConfig LandSlope
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Neighborhood Condition1 Condition2 BldgType
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## HouseStyle OverallQual OverallCond YearBuilt
## Length:1460 Min. : 1.000 Min. :1.000 Min. :1872
## Class :character 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954
## Mode :character Median : 6.000 Median :5.000 Median :1973
## Mean : 6.099 Mean :5.575 Mean :1971
## 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000
## Max. :10.000 Max. :9.000 Max. :2010
##
## YearRemodAdd RoofStyle RoofMatl Exterior1st
## Min. :1950 Length:1460 Length:1460 Length:1460
## 1st Qu.:1967 Class :character Class :character Class :character
## Median :1994 Mode :character Mode :character Mode :character
## Mean :1985
## 3rd Qu.:2004
## Max. :2010
##
## Exterior2nd MasVnrType MasVnrArea ExterQual
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 0.0 Mode :character
## Mean : 103.7
## 3rd Qu.: 166.0
## Max. :1600.0
## NA's :8
## ExterCond Foundation BsmtQual BsmtCond
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 383.5 Mode :character
## Mean : 443.6
## 3rd Qu.: 712.2
## Max. :5644.0
##
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Length:1460
## 1st Qu.: 0.00 1st Qu.: 223.0 1st Qu.: 795.8 Class :character
## Median : 0.00 Median : 477.5 Median : 991.5 Mode :character
## Mean : 46.55 Mean : 567.2 Mean :1057.4
## 3rd Qu.: 0.00 3rd Qu.: 808.0 3rd Qu.:1298.2
## Max. :1474.00 Max. :2336.0 Max. :6110.0
##
## HeatingQC CentralAir Electrical X1stFlrSF
## Length:1460 Length:1460 Length:1460 Min. : 334
## Class :character Class :character Class :character 1st Qu.: 882
## Mode :character Mode :character Mode :character Median :1087
## Mean :1163
## 3rd Qu.:1391
## Max. :4692
##
## X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath
## Min. : 0 Min. : 0.000 Min. : 334 Min. :0.0000
## 1st Qu.: 0 1st Qu.: 0.000 1st Qu.:1130 1st Qu.:0.0000
## Median : 0 Median : 0.000 Median :1464 Median :0.0000
## Mean : 347 Mean : 5.845 Mean :1515 Mean :0.4253
## 3rd Qu.: 728 3rd Qu.: 0.000 3rd Qu.:1777 3rd Qu.:1.0000
## Max. :2065 Max. :572.000 Max. :5642 Max. :3.0000
##
## BsmtHalfBath FullBath HalfBath BedroomAbvGr
## Min. :0.00000 Min. :0.000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000
## Median :0.00000 Median :2.000 Median :0.0000 Median :3.000
## Mean :0.05753 Mean :1.565 Mean :0.3829 Mean :2.866
## 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :2.00000 Max. :3.000 Max. :2.0000 Max. :8.000
##
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## Min. :0.000 Length:1460 Min. : 2.000 Length:1460
## 1st Qu.:1.000 Class :character 1st Qu.: 5.000 Class :character
## Median :1.000 Mode :character Median : 6.000 Mode :character
## Mean :1.047 Mean : 6.518
## 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :3.000 Max. :14.000
##
## Fireplaces FireplaceQu GarageType GarageYrBlt
## Min. :0.000 Length:1460 Length:1460 Min. :1900
## 1st Qu.:0.000 Class :character Class :character 1st Qu.:1961
## Median :1.000 Mode :character Mode :character Median :1980
## Mean :0.613 Mean :1979
## 3rd Qu.:1.000 3rd Qu.:2002
## Max. :3.000 Max. :2010
## NA's :81
## GarageFinish GarageCars GarageArea GarageQual
## Length:1460 Min. :0.000 Min. : 0.0 Length:1460
## Class :character 1st Qu.:1.000 1st Qu.: 334.5 Class :character
## Mode :character Median :2.000 Median : 480.0 Mode :character
## Mean :1.767 Mean : 473.0
## 3rd Qu.:2.000 3rd Qu.: 576.0
## Max. :4.000 Max. :1418.0
##
## GarageCond PavedDrive WoodDeckSF OpenPorchSF
## Length:1460 Length:1460 Min. : 0.00 Min. : 0.00
## Class :character Class :character 1st Qu.: 0.00 1st Qu.: 0.00
## Mode :character Mode :character Median : 0.00 Median : 25.00
## Mean : 94.24 Mean : 46.66
## 3rd Qu.:168.00 3rd Qu.: 68.00
## Max. :857.00 Max. :547.00
##
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 0.00 Median : 0.00 Median : 0.00 Median : 0.000
## Mean : 21.95 Mean : 3.41 Mean : 15.06 Mean : 2.759
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :552.00 Max. :508.00 Max. :480.00 Max. :738.000
##
## PoolQC Fence MiscFeature MiscVal
## Length:1460 Length:1460 Length:1460 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 0.00
## Mode :character Mode :character Mode :character Median : 0.00
## Mean : 43.49
## 3rd Qu.: 0.00
## Max. :15500.00
##
## MoSold YrSold SaleType SaleCondition
## Min. : 1.000 Min. :2006 Length:1460 Length:1460
## 1st Qu.: 5.000 1st Qu.:2007 Class :character Class :character
## Median : 6.000 Median :2008 Mode :character Mode :character
## Mean : 6.322 Mean :2008
## 3rd Qu.: 8.000 3rd Qu.:2009
## Max. :12.000 Max. :2010
##
## SalePrice
## Min. : 34900
## 1st Qu.:129975
## Median :163000
## Mean :180921
## 3rd Qu.:214000
## Max. :755000
##
# Check for nulls/missing data
anyNA(train_data)
## [1] TRUE
# How many missing values?
sum(is.na(train_data))
## [1] 6965
# 6,965 missing values
# Missing values from each column
Missing_values <- colSums(is.na(train_data))
Missing_values[Missing_values>0]
## LotFrontage Alley MasVnrType MasVnrArea BsmtQual BsmtCond
## 259 1369 8 8 37 37
## BsmtExposure BsmtFinType1 BsmtFinType2 Electrical FireplaceQu GarageType
## 38 37 38 1 690 81
## GarageYrBlt GarageFinish GarageQual GarageCond PoolQC Fence
## 81 81 81 81 1453 1179
## MiscFeature
## 1406
# Let's replace the missing numeric records with the average value of each column
for(i in 1:ncol(train_data)){
train_data[is.na(train_data[,i]), i] <- mean(train_data[,i], na.rm = TRUE)
}
# This filled out the missing values in LotFrontage (259), MasVnrArea (8), GarageYrBlt (81) so we went from 6,965 to 6,617 missing.
# Now replace the missing values in the categorical variables (https://stackoverflow.com/questions/36377813/impute-most-frequent-categorical-value-in-all-columns-in-data-frame)
i1 <- !sapply(train_data, is.numeric)
# Most common value
Mode <- function(x) {
ux <- sort(unique(x))
ux[which.max(tabulate(match(x, ux)))]
}
# Replace the NAs in character columns with the most freq
train_data[i1] <- lapply(train_data[i1], function(x)
replace(x, is.na(x), Mode(x[!is.na(x)])))
# Check to see if any values are missing/NA
sum(is.na(train_data)) #0, so all values have been imputed
## [1] 0
# Check to see if these variables have a positive (right) skew
skewness(train_data$LotFrontage) #2.641832
## [1] 2.382499
skewness(train_data$LotArea) #12.19514
## [1] 12.19514
skewness(train_data$GrLivArea) #1.365156
## [1] 1.365156
skewness(train_data$MiscVal) #24.45164
## [1] 24.45164
# Visualize the distributions for each
par(mfrow = c(2, 2))
hist(train_data$LotFrontage, main="Lot Frontage Histogram")
hist(train_data$LotArea, main="Lot Area Histogram")
hist(train_data$GrLivArea, main="Above Ground Living Area (Sq. Ft.) Histogram")
hist(train_data$MiscVal, main="Miscellaneous Feature $ Value Histogram")
# Summary of our independent variable
summary(train_data$GrLivArea)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 334 1130 1464 1515 1777 5642
sd(train_data$GrLivArea)
## [1] 525.4804
# Histogram
hist(train_data$GrLivArea, main="Histogram of GrLivArea", bins=50, xlab="GrLivArea",border="black", col="green")
# Density
plot(density(train_data$GrLivArea), main="Density of GrLivArea",border="black", col="green")
# Log transform
ggplot(train_data, aes(x = log(train_data$GrLivArea), fill = ..count..)) +
geom_histogram(binwidth = 0.05) +
ggtitle("Histogram of log GrLivArea") +
theme(plot.title = element_text(hjust = 0.5))
# Histogram of Y- SalePrice
ggplot(train_data, aes(x = SalePrice, fill = ..count..)) +
geom_histogram(binwidth = 5000) +
ggtitle("Histogram of SalePrice") +
ylab("Count of Houses") +
xlab("House Price") +
theme(plot.title = element_text(hjust = 0.5))
# This histogram is skewed, so we should log transform it.
ggplot(train_data, aes(x = log(train_data$SalePrice), fill = ..count..)) +
geom_histogram(binwidth = 0.05) +
ggtitle("Histogram of log SalePrice") +
ylab("Count of Houses") +
xlab("House Price") +
theme(plot.title = element_text(hjust = 0.5))
2. Probability. Calculate as a minimum the below probabilities a through c. Assume the small letter “\(x\)” is estimated as the 3d quartile of the \(X\) variable, and the small letter “\(y\)” is estimated as the 2d quartile of the \(Y\) variable. Interpret the meaning of all probabilities. In addition, make a table of counts as shown below.
# First, Check the quantiles for each of our variables to get the numeric context to answer the above question
X <- train_data$GrLivArea
Y <- train_data$SalePrice
quantile(X)
## 0% 25% 50% 75% 100%
## 334.00 1129.50 1464.00 1776.75 5642.00
#x is 3d quartile of X = 1,776.75
quantile(Y)
## 0% 25% 50% 75% 100%
## 34900 129975 163000 214000 755000
#y is 2d quartile of Y = 163,000
# Define the thresholds for later
x_threshold <- 1776.75 # 3rd quartile of X
y_threshold <- 163000 # 2nd quartile of Y
# Both X and Y exceed their thresholds: 315
# nrow(train_data[X > x_threshold & Y > y_threshold,])
# Y exceeds threshold: 728
# nrow(train_data[Y > y_threshold, ])
# Probability that Ground Living Area is more than 1,776.75 sq. ft., given that the Sales Price is greater than $163,000
P1 <- nrow(train_data[X > x_threshold & Y > y_threshold, ]) / nrow(train_data[Y > y_threshold, ])
cat("The probability is : ", P1*100,"%")
## The probability is : 43.26923 %
# X exceeds threshold: 365
# nrow(train_data[X > x_threshold,])
# Y exceeds threshold: 728
# nrow(train_data[Y> y_threshold, ])
# Probability that Ground Living Area is more than 1,776.75 sq. ft., and that the Sales Price is greater than $163,000
P2 <- nrow(subset(train_data, X > x_threshold, data=train_data && Y > y_threshold, data=data)) / nrow(train_data)
cat("The probability is : ", P2*100,"%")
## The probability is : 25 %
# Probability that Ground Living Area is less than 1,776.75 sq. ft., given that the Sales Price is greater than $163,000
P3 <- nrow(train_data[X < x_threshold & Y > y_threshold, ]) / nrow(train_data[Y > y_threshold, ])
cat("The probability is : ", P3*100,"%")
## The probability is : 56.73077 %
# Let's get the numbers for our chart above;
count(subset(train_data, ( X <= x_threshold & Y <= y_threshold)))
## n
## 1 682
count(subset(train_data, ( X > x_threshold & Y <= y_threshold)))
## n
## 1 50
count(subset(train_data, ( X <= x_threshold & Y > y_threshold)))
## n
## 1 413
count(subset(train_data, ( X > x_threshold & Y > y_threshold)))
## n
## 1 315
x/y | <=2d quartile | >2d quartile | Total |
---|---|---|---|
<=3d quartile | 682 | 413 | 1095 |
>3d quartile | 50 | 315 | 365 |
Total | 732 | 728 | 1460 |
3. Does splitting the training data in this fashion make them independent? Let \(A\) be the new variable counting those observations above the 3d quartile for \(X\), and let \(B\) be the new variable counting those observations above the 2d quartile for \(Y\). Does \(P(A|B)=P(A)P(B)\)? Check mathematically, and then evaluate by running a Chi Square test for association.
# Check mathematically
# Our quantile values, defined in Question 2.
x_threshold <- 1776.75 # 3rd quartile of X
y_threshold <- 163000 # 2nd quartile of Y
# Our probability we found in 2a
cat("The probability P(A|B) is : ", P1*100,"% \n")
## The probability P(A|B) is : 43.26923 %
# Define A and B, which are the counts of observations in our dataset train.csv, that are above x and y respectively.
A <- subset(train_data, X > x_threshold)
B <- subset(train_data, Y > y_threshold)
# Find probabilities of A and B from our original dataset
P_A <- nrow(A)/nrow(train_data) #0.25
P_B <- nrow(B)/nrow(train_data) #0.49863
# Calculate P(A) * P(B) = P_A * P_B #0.1246575
cat("The probability P(A)P(B) is : ", P_A * P_B*100,"%")
## The probability P(A)P(B) is : 12.46575 %
# Define the calculated number of observations
observed <- matrix(c(682, 413, 50, 315), nrow = 2, byrow = TRUE)
# Perform the chi-square test
chi_square_result <- chisq.test(observed)
# Print result
print(chi_square_result)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: observed
## X-squared = 256.53, df = 1, p-value < 2.2e-16
4. Descriptive and Inferential Statistics. Provide univariate descriptive statistics and appropriate plots for the training data set. Provide a scatterplot of \(X\) and \(Y\). Provide a 95% Confidence Interval for the difference in the mean of the variables. Derive a correlation matrix for two of the quantitative variables you selected. Test the hypothesis that the correlation between these variables is 0 and provide a 99% confidence interval. Discuss the meaning of your analysis.
summary(X)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 334 1130 1464 1515 1777 5642
summary(Y)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 34900 129975 163000 180921 214000 755000
# Histograms
par(mfrow = c(1, 2))
hist(X, main="GrLivArea")
hist(Y, main="SalePrice")
# Scatterplot
ggplot(train_data, aes(x = X, y = Y)) +
geom_point() +
labs(x = "GrLivArea", y = "SalePrice", title = "GrLivArea vs SalePrice") +
geom_smooth(method=lm)
## `geom_smooth()` using formula = 'y ~ x'
# mean(X) #1515.464
# mean(Y) #180921.2
t_test_result <- t.test(X, Y, paired = TRUE, conf.level = 0.95)
# Paired is true to compare the means between two related groups of samples
# If 100 sample distributions are taken from the population, 95 of them are likely to have prices within the confidence interval -183465.0 and -175346.4
# Extract confidence intervals
ci_lower <- t_test_result$conf.int[1]
ci_upper <- t_test_result$conf.int[2]
# Print confidence interval
cat("95% Confidence Interval for the difference in means: (", ci_lower, ", ", ci_upper, ")\n")
## 95% Confidence Interval for the difference in means: ( -183465 , -175346.4 )
correlation_matrix <- cor(train_data[, c("GrLivArea", "SalePrice")])
print(correlation_matrix)
## GrLivArea SalePrice
## GrLivArea 1.0000000 0.7086245
## SalePrice 0.7086245 1.0000000
print(cor.test(X, Y, method = "pearson", conf.level = 0.99))
##
## Pearson's product-moment correlation
##
## data: X and Y
## t = 38.348, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 99 percent confidence interval:
## 0.6733974 0.7406408
## sample estimates:
## cor
## 0.7086245
5. Linear Algebra and Correlation. Invert your correlation matrix. (This is known as the precision matrix and contains variance inflation factors on the diagonal.) Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix. Conduct principle components analysis (research this!) and interpret. Discuss.
# Invert the correlation matrix
inverted_matrix <- solve(correlation_matrix)
# Print the inverted matrix
print(inverted_matrix)
## GrLivArea SalePrice
## GrLivArea 2.008632 -1.423366
## SalePrice -1.423366 2.008632
# Multiply correlation matrix by precision matrix
round(correlation_matrix %*% inverted_matrix)
## GrLivArea SalePrice
## GrLivArea 1 0
## SalePrice 0 1
# Multiply precision matrix by correlation matrix
round(inverted_matrix %*% correlation_matrix)
## GrLivArea SalePrice
## GrLivArea 1 0
## SalePrice 0 1
# The mean correlation is high and thus conducting PCA is feasible
(correlation_matrix[1][1] + correlation_matrix[2][1]) / 2
## [1] 0.8543122
# PCA (https://www.datacamp.com/tutorial/pca-analysis-r)
data.pca <- princomp(data.frame(X, Y), cor=TRUE)
summary(data.pca)
## Importance of components:
## Comp.1 Comp.2
## Standard deviation 1.3071436 0.5397921
## Proportion of Variance 0.8543122 0.1456878
## Cumulative Proportion 0.8543122 1.0000000
# The components relation to the columns can be checked via the loadings matrix
data.pca$loadings[, 1:2]
## Comp.1 Comp.2
## X 0.7071068 0.7071068
## Y 0.7071068 -0.7071068
# Scree Plot
fviz_eig(data.pca, addlabels = TRUE)
# Biplot of attributes
biplot(data.pca)
fviz_pca_var(data.pca, col.var = "black")
6. Calculus-Based Probability & Statistics. Many times, it makes sense to fit a closed form distribution to data. For your variable that is skewed to the right, shift it so that the minimum value is above zero. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html). Find the optimal value of \(λ\) for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, \(λ\))). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.
# Basic info of X
skewness(X)
## [1] 1.365156
summary(X)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 334 1130 1464 1515 1777 5642
# Run fitdistr to fit an exponential PDF
exp_pdf = fitdistr(X, "exponential")
# Find optimal λ
lambda = exp_pdf$estimate
# 1000 Samples from exponential distribution
samples <- rexp(1000, rate = lambda)
# Plot a histogram and compare it with a histogram of your original variable.
par(mfrow = c(1, 2))
hist(X, main = "Original", xlab = "GrLivArea")
hist(samples, main = "Exponential Distribution", xlab = "GrLivArea")
# Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF)
quantile(samples, probs=c(0.05, 0.95))
## 5% 95%
## 59.9654 4436.0060
# Generate a 95% confidence interval from the empirical data, assuming normality
mean_x <- mean(X)
std_x <- sd(X)
n <- length(X)
z <- qnorm(1 - 0.05/2)
lower_ci <- mean_x - z * (std_x / sqrt(n))
upper_ci <- mean_x + z * (std_x / sqrt(n))
# Provide the empirical 5th percentile and 95th percentile of the data.
quantile(X, probs=c(0.05, 0.95))
## 5% 95%
## 848.0 2466.1
7. Modeling. Build some type of regression model and submit your model to the competition board. Provide your complete model summary and results with analysis. Report your Kaggle.com user name and score.
# Simple linear regression model of SalePrice by GrLivArea, for later comparison to our multi-linear regression model
train.lm <- lm(Y ~ X, data=train_data)
# Get summary of our model
summary(train.lm)
##
## Call:
## lm(formula = Y ~ X, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -462999 -29800 -1124 21957 339832
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18569.026 4480.755 4.144 3.61e-05 ***
## X 107.130 2.794 38.348 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 56070 on 1458 degrees of freedom
## Multiple R-squared: 0.5021, Adjusted R-squared: 0.5018
## F-statistic: 1471 on 1 and 1458 DF, p-value: < 2.2e-16
# Residuals scatterplot
plot(train.lm$fitted.values, train.lm$residuals, xlab="Fitted Values", ylab="Residuals")
abline(h=0, col="red")
# Residuals Histogram
hist(train.lm$residuals, main="Simple Linear Residuals Histogram")
# QQ Plot
qqnorm(train.lm$residuals)
qqline(train.lm$residuals)
# Plot
par(mfrow=c(2,2))
plot(train.lm)
# This has R^2 value of 50%- not bad, but could be better. Let's do multiple regression
# Visualize relationship between numeric fields and SalePrice
par(mfrow=c(2,2))
for (i in 2:(ncol(train_data)-1)) {
if (is.numeric(train_data[ ,i]) == "TRUE"){
plot(train_data$SalePrice ~ train_data[,i], main=names(train_data)[i], xlab=names(train_data)[i], ylab="SalePrice", col="black")
reg_line <- lm(train_data$SalePrice ~ train_data[, i])
abline(reg_line,col="red")
}}
# Multiple regression model- numeric fields only (35 fields)
train.lm2 <- lm(SalePrice ~ LotFrontage + LotArea + OverallQual + OverallCond + YearBuilt + YearRemodAdd + MasVnrArea + BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF + TotalBsmtSF + X1stFlrSF + X2ndFlrSF + LowQualFinSF + GrLivArea + BsmtFullBath + BsmtHalfBath + FullBath + HalfBath + BedroomAbvGr + KitchenAbvGr + KitchenQual + TotRmsAbvGrd + Fireplaces + GarageYrBlt + GarageCars + GarageArea + WoodDeckSF + OpenPorchSF + EnclosedPorch + ScreenPorch + PoolArea + MiscVal + MoSold + YrSold, data = train_data)
# Check summary of our multiple-regression model
summary(train.lm2) #0.8244
##
## Call:
## lm(formula = SalePrice ~ LotFrontage + LotArea + OverallQual +
## OverallCond + YearBuilt + YearRemodAdd + MasVnrArea + BsmtFinSF1 +
## BsmtFinSF2 + BsmtUnfSF + TotalBsmtSF + X1stFlrSF + X2ndFlrSF +
## LowQualFinSF + GrLivArea + BsmtFullBath + BsmtHalfBath +
## FullBath + HalfBath + BedroomAbvGr + KitchenAbvGr + KitchenQual +
## TotRmsAbvGrd + Fireplaces + GarageYrBlt + GarageCars + GarageArea +
## WoodDeckSF + OpenPorchSF + EnclosedPorch + ScreenPorch +
## PoolArea + MiscVal + MoSold + YrSold, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -488365 -14875 -1760 13171 284820
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.193e+05 1.372e+06 0.524 0.600113
## LotFrontage 2.860e+01 4.780e+01 0.598 0.549682
## LotArea 4.862e-01 9.910e-02 4.906 1.03e-06 ***
## OverallQual 1.359e+04 1.186e+03 11.460 < 2e-16 ***
## OverallCond 5.301e+03 1.007e+03 5.261 1.65e-07 ***
## YearBuilt 3.037e+02 6.635e+01 4.577 5.13e-06 ***
## YearRemodAdd 7.119e+01 6.987e+01 1.019 0.308443
## MasVnrArea 2.599e+01 5.782e+00 4.494 7.56e-06 ***
## BsmtFinSF1 1.735e+01 4.543e+00 3.819 0.000140 ***
## BsmtFinSF2 9.916e+00 6.843e+00 1.449 0.147526
## BsmtUnfSF 9.448e+00 4.060e+00 2.327 0.020091 *
## TotalBsmtSF NA NA NA NA
## X1stFlrSF 4.707e+01 5.643e+00 8.340 < 2e-16 ***
## X2ndFlrSF 4.148e+01 4.735e+00 8.759 < 2e-16 ***
## LowQualFinSF 8.739e+00 1.930e+01 0.453 0.650687
## GrLivArea NA NA NA NA
## BsmtFullBath 7.966e+03 2.534e+03 3.143 0.001706 **
## BsmtHalfBath 5.278e+02 3.961e+03 0.133 0.894015
## FullBath 4.346e+03 2.762e+03 1.574 0.115761
## HalfBath -1.454e+03 2.584e+03 -0.563 0.573683
## BedroomAbvGr -5.548e+03 1.669e+03 -3.325 0.000906 ***
## KitchenAbvGr -2.298e+04 4.734e+03 -4.854 1.34e-06 ***
## KitchenQualFa -3.794e+04 7.656e+03 -4.956 8.08e-07 ***
## KitchenQualGd -4.500e+04 4.087e+03 -11.012 < 2e-16 ***
## KitchenQualTA -5.118e+04 4.750e+03 -10.776 < 2e-16 ***
## TotRmsAbvGrd 4.044e+03 1.209e+03 3.345 0.000844 ***
## Fireplaces 3.882e+03 1.723e+03 2.253 0.024392 *
## GarageYrBlt 7.402e+01 6.758e+01 1.095 0.273543
## GarageCars 1.196e+04 2.792e+03 4.285 1.95e-05 ***
## GarageArea -2.548e+00 9.641e+00 -0.264 0.791595
## WoodDeckSF 2.551e+01 7.759e+00 3.288 0.001035 **
## OpenPorchSF -4.718e+00 1.472e+01 -0.321 0.748624
## EnclosedPorch 1.230e+01 1.641e+01 0.750 0.453574
## ScreenPorch 5.074e+01 1.668e+01 3.043 0.002388 **
## PoolArea -4.091e+01 2.305e+01 -1.774 0.076230 .
## MiscVal 8.316e-02 1.801e+00 0.046 0.963178
## MoSold -7.256e+00 3.341e+02 -0.022 0.982678
## YrSold -8.000e+02 6.809e+02 -1.175 0.240227
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 33700 on 1424 degrees of freedom
## Multiple R-squared: 0.8244, Adjusted R-squared: 0.82
## F-statistic: 191 on 35 and 1424 DF, p-value: < 2.2e-16
# Refine multiple-regression model with backwards elimination process for each field with high p-values (16 fields)
train.lm2 <- update(train.lm2, .~. -BsmtHalfBath)
train.lm2 <- update(train.lm2, .~. -MoSold)
train.lm2 <- update(train.lm2, .~. -MiscVal)
train.lm2 <- update(train.lm2, .~. -GarageArea)
train.lm2 <- update(train.lm2, .~. -OpenPorchSF)
train.lm2 <- update(train.lm2, .~. -LowQualFinSF)
train.lm2 <- update(train.lm2, .~. -LotFrontage)
train.lm2 <- update(train.lm2, .~. -EnclosedPorch)
train.lm2 <- update(train.lm2, .~. -HalfBath)
train.lm2 <- update(train.lm2, .~. -YearRemodAdd)
train.lm2 <- update(train.lm2, .~. -GrLivArea) # This is my independent variable, but it has a high p-value; 0.654496
train.lm2 <- update(train.lm2, .~. -GarageYrBlt)
train.lm2 <- update(train.lm2, .~. -BsmtFinSF2)
train.lm2 <- update(train.lm2, .~. -BsmtUnfSF)
train.lm2 <- update(train.lm2, .~. -YrSold)
train.lm2 <- update(train.lm2, .~. -PoolArea)
# scatterplot of fitted vs residual values
plot(train.lm2$fitted.values, train.lm2$residuals, xlab="Fitted Values", ylab="Residuals",main = "Residuals vs Fitted")
abline(h=0, col="red")
# Residuals Histogram
hist(train.lm2$residuals, main="Multiple Linear Residuals Histogram")
# QQ Plot
qqnorm(train.lm2$residuals)
qqline(train.lm2$residuals)
# Plot
par(mfrow=c(2,2))
plot(train.lm2)
# Read test.csv
test_data <- read.csv("https://raw.githubusercontent.com/RonBalaban/CUNY-SPS-R/main/test.csv")
# Summary
summary(test_data)
## Id MSSubClass MSZoning LotFrontage
## Min. :1461 Min. : 20.00 Length:1459 Min. : 21.00
## 1st Qu.:1826 1st Qu.: 20.00 Class :character 1st Qu.: 58.00
## Median :2190 Median : 50.00 Mode :character Median : 67.00
## Mean :2190 Mean : 57.38 Mean : 68.58
## 3rd Qu.:2554 3rd Qu.: 70.00 3rd Qu.: 80.00
## Max. :2919 Max. :190.00 Max. :200.00
## NA's :227
## LotArea Street Alley LotShape
## Min. : 1470 Length:1459 Length:1459 Length:1459
## 1st Qu.: 7391 Class :character Class :character Class :character
## Median : 9399 Mode :character Mode :character Mode :character
## Mean : 9819
## 3rd Qu.:11518
## Max. :56600
##
## LandContour Utilities LotConfig LandSlope
## Length:1459 Length:1459 Length:1459 Length:1459
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Neighborhood Condition1 Condition2 BldgType
## Length:1459 Length:1459 Length:1459 Length:1459
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## HouseStyle OverallQual OverallCond YearBuilt
## Length:1459 Min. : 1.000 Min. :1.000 Min. :1879
## Class :character 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1953
## Mode :character Median : 6.000 Median :5.000 Median :1973
## Mean : 6.079 Mean :5.554 Mean :1971
## 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2001
## Max. :10.000 Max. :9.000 Max. :2010
##
## YearRemodAdd RoofStyle RoofMatl Exterior1st
## Min. :1950 Length:1459 Length:1459 Length:1459
## 1st Qu.:1963 Class :character Class :character Class :character
## Median :1992 Mode :character Mode :character Mode :character
## Mean :1984
## 3rd Qu.:2004
## Max. :2010
##
## Exterior2nd MasVnrType MasVnrArea ExterQual
## Length:1459 Length:1459 Min. : 0.0 Length:1459
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 0.0 Mode :character
## Mean : 100.7
## 3rd Qu.: 164.0
## Max. :1290.0
## NA's :15
## ExterCond Foundation BsmtQual BsmtCond
## Length:1459 Length:1459 Length:1459 Length:1459
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## Length:1459 Length:1459 Min. : 0.0 Length:1459
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 350.5 Mode :character
## Mean : 439.2
## 3rd Qu.: 753.5
## Max. :4010.0
## NA's :1
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## Min. : 0.00 Min. : 0.0 Min. : 0 Length:1459
## 1st Qu.: 0.00 1st Qu.: 219.2 1st Qu.: 784 Class :character
## Median : 0.00 Median : 460.0 Median : 988 Mode :character
## Mean : 52.62 Mean : 554.3 Mean :1046
## 3rd Qu.: 0.00 3rd Qu.: 797.8 3rd Qu.:1305
## Max. :1526.00 Max. :2140.0 Max. :5095
## NA's :1 NA's :1 NA's :1
## HeatingQC CentralAir Electrical X1stFlrSF
## Length:1459 Length:1459 Length:1459 Min. : 407.0
## Class :character Class :character Class :character 1st Qu.: 873.5
## Mode :character Mode :character Mode :character Median :1079.0
## Mean :1156.5
## 3rd Qu.:1382.5
## Max. :5095.0
##
## X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath
## Min. : 0 Min. : 0.000 Min. : 407 Min. :0.0000
## 1st Qu.: 0 1st Qu.: 0.000 1st Qu.:1118 1st Qu.:0.0000
## Median : 0 Median : 0.000 Median :1432 Median :0.0000
## Mean : 326 Mean : 3.543 Mean :1486 Mean :0.4345
## 3rd Qu.: 676 3rd Qu.: 0.000 3rd Qu.:1721 3rd Qu.:1.0000
## Max. :1862 Max. :1064.000 Max. :5095 Max. :3.0000
## NA's :2
## BsmtHalfBath FullBath HalfBath BedroomAbvGr
## Min. :0.0000 Min. :0.000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.0000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000
## Median :0.0000 Median :2.000 Median :0.0000 Median :3.000
## Mean :0.0652 Mean :1.571 Mean :0.3777 Mean :2.854
## 3rd Qu.:0.0000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :2.0000 Max. :4.000 Max. :2.0000 Max. :6.000
## NA's :2
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## Min. :0.000 Length:1459 Min. : 3.000 Length:1459
## 1st Qu.:1.000 Class :character 1st Qu.: 5.000 Class :character
## Median :1.000 Mode :character Median : 6.000 Mode :character
## Mean :1.042 Mean : 6.385
## 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :2.000 Max. :15.000
##
## Fireplaces FireplaceQu GarageType GarageYrBlt
## Min. :0.0000 Length:1459 Length:1459 Min. :1895
## 1st Qu.:0.0000 Class :character Class :character 1st Qu.:1959
## Median :0.0000 Mode :character Mode :character Median :1979
## Mean :0.5812 Mean :1978
## 3rd Qu.:1.0000 3rd Qu.:2002
## Max. :4.0000 Max. :2207
## NA's :78
## GarageFinish GarageCars GarageArea GarageQual
## Length:1459 Min. :0.000 Min. : 0.0 Length:1459
## Class :character 1st Qu.:1.000 1st Qu.: 318.0 Class :character
## Mode :character Median :2.000 Median : 480.0 Mode :character
## Mean :1.766 Mean : 472.8
## 3rd Qu.:2.000 3rd Qu.: 576.0
## Max. :5.000 Max. :1488.0
## NA's :1 NA's :1
## GarageCond PavedDrive WoodDeckSF OpenPorchSF
## Length:1459 Length:1459 Min. : 0.00 Min. : 0.00
## Class :character Class :character 1st Qu.: 0.00 1st Qu.: 0.00
## Mode :character Mode :character Median : 0.00 Median : 28.00
## Mean : 93.17 Mean : 48.31
## 3rd Qu.: 168.00 3rd Qu.: 72.00
## Max. :1424.00 Max. :742.00
##
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea
## Min. : 0.00 Min. : 0.000 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 0.00 Median : 0.000 Median : 0.00 Median : 0.000
## Mean : 24.24 Mean : 1.794 Mean : 17.06 Mean : 1.744
## 3rd Qu.: 0.00 3rd Qu.: 0.000 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :1012.00 Max. :360.000 Max. :576.00 Max. :800.000
##
## PoolQC Fence MiscFeature MiscVal
## Length:1459 Length:1459 Length:1459 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 0.00
## Mode :character Mode :character Mode :character Median : 0.00
## Mean : 58.17
## 3rd Qu.: 0.00
## Max. :17000.00
##
## MoSold YrSold SaleType SaleCondition
## Min. : 1.000 Min. :2006 Length:1459 Length:1459
## 1st Qu.: 4.000 1st Qu.:2007 Class :character Class :character
## Median : 6.000 Median :2008 Mode :character Mode :character
## Mean : 6.104 Mean :2008
## 3rd Qu.: 8.000 3rd Qu.:2009
## Max. :12.000 Max. :2010
##
# Check for nulls/missing data
anyNA(test_data)
## [1] TRUE
# How many missing values?
sum(is.na(test_data))
## [1] 7000
# 7000 missing values
# Missing values from each column
Missing_values2 <- colSums(is.na(test_data))
Missing_values2[Missing_values2>0]
## MSZoning LotFrontage Alley Utilities Exterior1st Exterior2nd
## 4 227 1352 2 1 1
## MasVnrType MasVnrArea BsmtQual BsmtCond BsmtExposure BsmtFinType1
## 16 15 44 45 44 42
## BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF BsmtFullBath
## 1 42 1 1 1 2
## BsmtHalfBath KitchenQual Functional FireplaceQu GarageType GarageYrBlt
## 2 1 2 730 76 78
## GarageFinish GarageCars GarageArea GarageQual GarageCond PoolQC
## 78 1 1 78 78 1456
## Fence MiscFeature SaleType
## 1169 1408 1
# Let's replace the missing numeric records with the average value of each column
for(i in 1:ncol(test_data)){
test_data[is.na(test_data[ ,i]), i] <- mean(test_data[ ,i], na.rm = TRUE)
}
# Now replace the missing values in the categorical variables (https://stackoverflow.com/questions/36377813/impute-most-frequent-categorical-value-in-all-columns-in-data-frame)
i2 <- !sapply(test_data, is.numeric)
# Most common value
Mode <- function(x) {
ux <- sort(unique(x))
ux[which.max(tabulate(match(x, ux)))]
}
# Replace the NAs in character columns with the most freq
test_data[i2] <- lapply(test_data[i2], function(x)
replace(x, is.na(x), Mode(x[!is.na(x)])))
# Check to see if any values are missing/NA
sum(is.na(test_data))
## [1] 0
# 0, so all values have been imputed
# Predict Sale Price for test_data from our multiple linear regression model
test_data$SalePrice <- predict(train.lm2, newdata = test_data)
# Create new dataframe of ID's and predicted SalePrice for Kaggle Submission
Kaggle <- test_data[, c('Id', 'SalePrice')]
# Export new dataframe to csv
write.csv(Kaggle, "605final.csv", row.names = FALSE)