You are to register for Kaggle.com (free) and compete in the House Prices: Advanced Regression Techniques competition. Follow the steps below to complete the exam requirements.
library(readr)
library(dplyr)
library(purrr)
library(tidyr)
library(ggplot2)
library(moments)
library(corrplot)
library(stats)
library(MASS)
library(skimr)
library(caret)
library(Metrics)
train.csv
) and define that variable as
X
. Make sure this variable is skewed to the right.Y
.train_data <- read_csv("train.csv")
## Rows: 1460 Columns: 81
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (43): MSZoning, Street, Alley, LotShape, LandContour, Utilities, LotConf...
## dbl (38): Id, MSSubClass, LotFrontage, LotArea, OverallQual, OverallCond, Ye...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(train_data)
## # A tibble: 6 × 81
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## <dbl> <dbl> <chr> <dbl> <dbl> <chr> <chr> <chr>
## 1 1 60 RL 65 8450 Pave <NA> Reg
## 2 2 20 RL 80 9600 Pave <NA> Reg
## 3 3 60 RL 68 11250 Pave <NA> IR1
## 4 4 70 RL 60 9550 Pave <NA> IR1
## 5 5 60 RL 84 14260 Pave <NA> IR1
## 6 6 50 RL 85 14115 Pave <NA> IR1
## # ℹ 73 more variables: LandContour <chr>, Utilities <chr>, LotConfig <chr>,
## # LandSlope <chr>, Neighborhood <chr>, Condition1 <chr>, Condition2 <chr>,
## # BldgType <chr>, HouseStyle <chr>, OverallQual <dbl>, OverallCond <dbl>,
## # YearBuilt <dbl>, YearRemodAdd <dbl>, RoofStyle <chr>, RoofMatl <chr>,
## # Exterior1st <chr>, Exterior2nd <chr>, MasVnrType <chr>, MasVnrArea <dbl>,
## # ExterQual <chr>, ExterCond <chr>, Foundation <chr>, BsmtQual <chr>,
## # BsmtCond <chr>, BsmtExposure <chr>, BsmtFinType1 <chr>, BsmtFinSF1 <dbl>, …
numeric_data <- select_if(train_data, is.numeric)
# Calculate skewness for each numeric column
skewness_data <- sapply(numeric_data, skewness)
# Filter for highly skewed variables (skewness > 1 indicates right skew)
skewed_variables <- skewness_data[skewness_data > 1]
names(skewed_variables)
## [1] "MSSubClass" NA "LotArea" NA
## [5] "BsmtFinSF1" "BsmtFinSF2" "TotalBsmtSF" "1stFlrSF"
## [9] "LowQualFinSF" "GrLivArea" "BsmtHalfBath" "KitchenAbvGr"
## [13] NA "WoodDeckSF" "OpenPorchSF" "EnclosedPorch"
## [17] "3SsnPorch" "ScreenPorch" "PoolArea" "MiscVal"
## [21] "SalePrice"
X <- train_data$GrLivArea
ggplot(train_data, aes(x = GrLivArea)) + geom_histogram(aes(y = ..density..), binwidth = 50, fill = "blue", alpha = 0.7) + geom_density(color = "red") + labs(title = "Distribution of GrLivArea", x = "Above Ground Living Area (sq ft)", y = "Density") + theme_minimal()
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Y <- train_data$SalePrice
Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the 3d quartile of the X variable, and the small letter “y” is estimated as the 2d quartile of the Y variable. Interpret the meaning of all probabilities. In addition, make a table of counts as shown below.
x <- quantile(X, 0.75) # 3rd quartile for X
y <- quantile(Y, 0.50) # 2nd quartile for Y
P_X_greater_x_given_Y_greater_y <- mean(X > x & Y > y) / mean(Y > y)
P_X_greater_x_and_Y_greater_y <- mean(X > x & Y > y)
P_X_less_x_given_Y_greater_y <- mean(X < x & Y > y) / mean(Y > y)
P_X_greater_x_given_Y_greater_y
## [1] 0.4326923
P_X_greater_x_and_Y_greater_y
## [1] 0.2157534
P_X_less_x_given_Y_greater_y
## [1] 0.5673077
P(X>x | Y>y): Given that house prices are above the median, 43.27% of houses have larger living areas than most.
P(X>x , Y>y): Only 21.58% of all houses have both prices above median and larger-than-typical living areas.
P(X<x | Y>y): When house prices exceed the median, 56.73% of these have living areas smaller than the upper quartile
Larger living areas less common in higher-priced homes; price not solely driven by size.
Create a table of counts as shown below:
x/y | <=2d quartile | >2d quartile | Total |
---|---|---|---|
<=3d quartile | |||
>3d quartile | |||
Total |
# Creating a contingency table
count_table <- matrix(ncol = 3, nrow = 3)
colnames(count_table) <- c("<=2nd quartile Y", ">2nd quartile Y", "Total")
rownames(count_table) <- c("<=3rd quartile X", ">3rd quartile X", "Total")
# Filling the table
count_table[1, 1] <- sum(X <= x & Y <= y)
count_table[1, 2] <- sum(X <= x & Y > y)
count_table[2, 1] <- sum(X > x & Y <= y)
count_table[2, 2] <- sum(X > x & Y > y)
count_table[1, 3] <- sum(X <= x)
count_table[2, 3] <- sum(X > x)
count_table[3, 1] <- sum(Y <= y)
count_table[3, 2] <- sum(Y > y)
count_table[3, 3] <- nrow(train_data)
# Printing the table
as.table(count_table)
## <=2nd quartile Y >2nd quartile Y Total
## <=3rd quartile X 682 413 1095
## >3rd quartile X 50 315 365
## Total 732 728 1460
A
as the variable counting observations above
the 3rd quartile for X
, and B
as the variable
counting observations above the 2nd quartile for Y
.# Variables for independence check
A <- ifelse(X > x, 1, 0) # 1 if X is above the 3rd quartile
B <- ifelse(Y > y, 1, 0) # 1 if Y is above the 2nd quartile
# Contingency table for chi-squared test
independence_table <- table(A, B)
# Performing chi-squared test
chi_squared_test <- chisq.test(independence_table)
# Checking if probabilities multiply to joint probability
P_A <- mean(A == 1)
P_B <- mean(B == 1)
P_A_and_B <- mean(A == 1 & B == 1)
independent_check <- P_A_and_B == P_A * P_B
# Output results
list(independence = independent_check,chi_squared_test = chi_squared_test
)
## $independence
## [1] FALSE
##
## $chi_squared_test
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: independence_table
## X-squared = 256.53, df = 1, p-value < 2.2e-16
The above result shows that A (observations above the 3rd quartile for living area) and 𝐵 B (observations above the 2nd quartile for sale price) are not independent. The mathmatical calculation & statistical tests show a significant association between larger living areas and higher sale prices.
skim(train_data)
Name | train_data |
Number of rows | 1460 |
Number of columns | 81 |
_______________________ | |
Column type frequency: | |
character | 43 |
numeric | 38 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
MSZoning | 0 | 1.00 | 2 | 7 | 0 | 5 | 0 |
Street | 0 | 1.00 | 4 | 4 | 0 | 2 | 0 |
Alley | 1369 | 0.06 | 4 | 4 | 0 | 2 | 0 |
LotShape | 0 | 1.00 | 3 | 3 | 0 | 4 | 0 |
LandContour | 0 | 1.00 | 3 | 3 | 0 | 4 | 0 |
Utilities | 0 | 1.00 | 6 | 6 | 0 | 2 | 0 |
LotConfig | 0 | 1.00 | 3 | 7 | 0 | 5 | 0 |
LandSlope | 0 | 1.00 | 3 | 3 | 0 | 3 | 0 |
Neighborhood | 0 | 1.00 | 5 | 7 | 0 | 25 | 0 |
Condition1 | 0 | 1.00 | 4 | 6 | 0 | 9 | 0 |
Condition2 | 0 | 1.00 | 4 | 6 | 0 | 8 | 0 |
BldgType | 0 | 1.00 | 4 | 6 | 0 | 5 | 0 |
HouseStyle | 0 | 1.00 | 4 | 6 | 0 | 8 | 0 |
RoofStyle | 0 | 1.00 | 3 | 7 | 0 | 6 | 0 |
RoofMatl | 0 | 1.00 | 4 | 7 | 0 | 8 | 0 |
Exterior1st | 0 | 1.00 | 5 | 7 | 0 | 15 | 0 |
Exterior2nd | 0 | 1.00 | 5 | 7 | 0 | 16 | 0 |
MasVnrType | 8 | 0.99 | 4 | 7 | 0 | 4 | 0 |
ExterQual | 0 | 1.00 | 2 | 2 | 0 | 4 | 0 |
ExterCond | 0 | 1.00 | 2 | 2 | 0 | 5 | 0 |
Foundation | 0 | 1.00 | 4 | 6 | 0 | 6 | 0 |
BsmtQual | 37 | 0.97 | 2 | 2 | 0 | 4 | 0 |
BsmtCond | 37 | 0.97 | 2 | 2 | 0 | 4 | 0 |
BsmtExposure | 38 | 0.97 | 2 | 2 | 0 | 4 | 0 |
BsmtFinType1 | 37 | 0.97 | 3 | 3 | 0 | 6 | 0 |
BsmtFinType2 | 38 | 0.97 | 3 | 3 | 0 | 6 | 0 |
Heating | 0 | 1.00 | 4 | 5 | 0 | 6 | 0 |
HeatingQC | 0 | 1.00 | 2 | 2 | 0 | 5 | 0 |
CentralAir | 0 | 1.00 | 1 | 1 | 0 | 2 | 0 |
Electrical | 1 | 1.00 | 3 | 5 | 0 | 5 | 0 |
KitchenQual | 0 | 1.00 | 2 | 2 | 0 | 4 | 0 |
Functional | 0 | 1.00 | 3 | 4 | 0 | 7 | 0 |
FireplaceQu | 690 | 0.53 | 2 | 2 | 0 | 5 | 0 |
GarageType | 81 | 0.94 | 6 | 7 | 0 | 6 | 0 |
GarageFinish | 81 | 0.94 | 3 | 3 | 0 | 3 | 0 |
GarageQual | 81 | 0.94 | 2 | 2 | 0 | 5 | 0 |
GarageCond | 81 | 0.94 | 2 | 2 | 0 | 5 | 0 |
PavedDrive | 0 | 1.00 | 1 | 1 | 0 | 3 | 0 |
PoolQC | 1453 | 0.00 | 2 | 2 | 0 | 3 | 0 |
Fence | 1179 | 0.19 | 4 | 5 | 0 | 4 | 0 |
MiscFeature | 1406 | 0.04 | 4 | 4 | 0 | 4 | 0 |
SaleType | 0 | 1.00 | 2 | 5 | 0 | 9 | 0 |
SaleCondition | 0 | 1.00 | 6 | 7 | 0 | 6 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
Id | 0 | 1.00 | 730.50 | 421.61 | 1 | 365.75 | 730.5 | 1095.25 | 1460 | ▇▇▇▇▇ |
MSSubClass | 0 | 1.00 | 56.90 | 42.30 | 20 | 20.00 | 50.0 | 70.00 | 190 | ▇▅▂▁▁ |
LotFrontage | 259 | 0.82 | 70.05 | 24.28 | 21 | 59.00 | 69.0 | 80.00 | 313 | ▇▃▁▁▁ |
LotArea | 0 | 1.00 | 10516.83 | 9981.26 | 1300 | 7553.50 | 9478.5 | 11601.50 | 215245 | ▇▁▁▁▁ |
OverallQual | 0 | 1.00 | 6.10 | 1.38 | 1 | 5.00 | 6.0 | 7.00 | 10 | ▁▂▇▅▁ |
OverallCond | 0 | 1.00 | 5.58 | 1.11 | 1 | 5.00 | 5.0 | 6.00 | 9 | ▁▁▇▅▁ |
YearBuilt | 0 | 1.00 | 1971.27 | 30.20 | 1872 | 1954.00 | 1973.0 | 2000.00 | 2010 | ▁▂▃▆▇ |
YearRemodAdd | 0 | 1.00 | 1984.87 | 20.65 | 1950 | 1967.00 | 1994.0 | 2004.00 | 2010 | ▅▂▂▃▇ |
MasVnrArea | 8 | 0.99 | 103.69 | 181.07 | 0 | 0.00 | 0.0 | 166.00 | 1600 | ▇▁▁▁▁ |
BsmtFinSF1 | 0 | 1.00 | 443.64 | 456.10 | 0 | 0.00 | 383.5 | 712.25 | 5644 | ▇▁▁▁▁ |
BsmtFinSF2 | 0 | 1.00 | 46.55 | 161.32 | 0 | 0.00 | 0.0 | 0.00 | 1474 | ▇▁▁▁▁ |
BsmtUnfSF | 0 | 1.00 | 567.24 | 441.87 | 0 | 223.00 | 477.5 | 808.00 | 2336 | ▇▅▂▁▁ |
TotalBsmtSF | 0 | 1.00 | 1057.43 | 438.71 | 0 | 795.75 | 991.5 | 1298.25 | 6110 | ▇▃▁▁▁ |
1stFlrSF | 0 | 1.00 | 1162.63 | 386.59 | 334 | 882.00 | 1087.0 | 1391.25 | 4692 | ▇▅▁▁▁ |
2ndFlrSF | 0 | 1.00 | 346.99 | 436.53 | 0 | 0.00 | 0.0 | 728.00 | 2065 | ▇▃▂▁▁ |
LowQualFinSF | 0 | 1.00 | 5.84 | 48.62 | 0 | 0.00 | 0.0 | 0.00 | 572 | ▇▁▁▁▁ |
GrLivArea | 0 | 1.00 | 1515.46 | 525.48 | 334 | 1129.50 | 1464.0 | 1776.75 | 5642 | ▇▇▁▁▁ |
BsmtFullBath | 0 | 1.00 | 0.43 | 0.52 | 0 | 0.00 | 0.0 | 1.00 | 3 | ▇▆▁▁▁ |
BsmtHalfBath | 0 | 1.00 | 0.06 | 0.24 | 0 | 0.00 | 0.0 | 0.00 | 2 | ▇▁▁▁▁ |
FullBath | 0 | 1.00 | 1.57 | 0.55 | 0 | 1.00 | 2.0 | 2.00 | 3 | ▁▇▁▇▁ |
HalfBath | 0 | 1.00 | 0.38 | 0.50 | 0 | 0.00 | 0.0 | 1.00 | 2 | ▇▁▅▁▁ |
BedroomAbvGr | 0 | 1.00 | 2.87 | 0.82 | 0 | 2.00 | 3.0 | 3.00 | 8 | ▁▇▂▁▁ |
KitchenAbvGr | 0 | 1.00 | 1.05 | 0.22 | 0 | 1.00 | 1.0 | 1.00 | 3 | ▁▇▁▁▁ |
TotRmsAbvGrd | 0 | 1.00 | 6.52 | 1.63 | 2 | 5.00 | 6.0 | 7.00 | 14 | ▂▇▇▁▁ |
Fireplaces | 0 | 1.00 | 0.61 | 0.64 | 0 | 0.00 | 1.0 | 1.00 | 3 | ▇▇▁▁▁ |
GarageYrBlt | 81 | 0.94 | 1978.51 | 24.69 | 1900 | 1961.00 | 1980.0 | 2002.00 | 2010 | ▁▁▅▅▇ |
GarageCars | 0 | 1.00 | 1.77 | 0.75 | 0 | 1.00 | 2.0 | 2.00 | 4 | ▁▃▇▂▁ |
GarageArea | 0 | 1.00 | 472.98 | 213.80 | 0 | 334.50 | 480.0 | 576.00 | 1418 | ▂▇▃▁▁ |
WoodDeckSF | 0 | 1.00 | 94.24 | 125.34 | 0 | 0.00 | 0.0 | 168.00 | 857 | ▇▂▁▁▁ |
OpenPorchSF | 0 | 1.00 | 46.66 | 66.26 | 0 | 0.00 | 25.0 | 68.00 | 547 | ▇▁▁▁▁ |
EnclosedPorch | 0 | 1.00 | 21.95 | 61.12 | 0 | 0.00 | 0.0 | 0.00 | 552 | ▇▁▁▁▁ |
3SsnPorch | 0 | 1.00 | 3.41 | 29.32 | 0 | 0.00 | 0.0 | 0.00 | 508 | ▇▁▁▁▁ |
ScreenPorch | 0 | 1.00 | 15.06 | 55.76 | 0 | 0.00 | 0.0 | 0.00 | 480 | ▇▁▁▁▁ |
PoolArea | 0 | 1.00 | 2.76 | 40.18 | 0 | 0.00 | 0.0 | 0.00 | 738 | ▇▁▁▁▁ |
MiscVal | 0 | 1.00 | 43.49 | 496.12 | 0 | 0.00 | 0.0 | 0.00 | 15500 | ▇▁▁▁▁ |
MoSold | 0 | 1.00 | 6.32 | 2.70 | 1 | 5.00 | 6.0 | 8.00 | 12 | ▃▆▇▃▃ |
YrSold | 0 | 1.00 | 2007.82 | 1.33 | 2006 | 2007.00 | 2008.0 | 2009.00 | 2010 | ▇▇▇▇▅ |
SalePrice | 0 | 1.00 | 180921.20 | 79442.50 | 34900 | 129975.00 | 163000.0 | 214000.00 | 755000 | ▇▅▁▁▁ |
summary(train_data)
## Id MSSubClass MSZoning LotFrontage
## Min. : 1.0 Min. : 20.0 Length:1460 Min. : 21.00
## 1st Qu.: 365.8 1st Qu.: 20.0 Class :character 1st Qu.: 59.00
## Median : 730.5 Median : 50.0 Mode :character Median : 69.00
## Mean : 730.5 Mean : 56.9 Mean : 70.05
## 3rd Qu.:1095.2 3rd Qu.: 70.0 3rd Qu.: 80.00
## Max. :1460.0 Max. :190.0 Max. :313.00
## NA's :259
## LotArea Street Alley LotShape
## Min. : 1300 Length:1460 Length:1460 Length:1460
## 1st Qu.: 7554 Class :character Class :character Class :character
## Median : 9478 Mode :character Mode :character Mode :character
## Mean : 10517
## 3rd Qu.: 11602
## Max. :215245
##
## LandContour Utilities LotConfig LandSlope
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Neighborhood Condition1 Condition2 BldgType
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## HouseStyle OverallQual OverallCond YearBuilt
## Length:1460 Min. : 1.000 Min. :1.000 Min. :1872
## Class :character 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954
## Mode :character Median : 6.000 Median :5.000 Median :1973
## Mean : 6.099 Mean :5.575 Mean :1971
## 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000
## Max. :10.000 Max. :9.000 Max. :2010
##
## YearRemodAdd RoofStyle RoofMatl Exterior1st
## Min. :1950 Length:1460 Length:1460 Length:1460
## 1st Qu.:1967 Class :character Class :character Class :character
## Median :1994 Mode :character Mode :character Mode :character
## Mean :1985
## 3rd Qu.:2004
## Max. :2010
##
## Exterior2nd MasVnrType MasVnrArea ExterQual
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 0.0 Mode :character
## Mean : 103.7
## 3rd Qu.: 166.0
## Max. :1600.0
## NA's :8
## ExterCond Foundation BsmtQual BsmtCond
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 383.5 Mode :character
## Mean : 443.6
## 3rd Qu.: 712.2
## Max. :5644.0
##
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Length:1460
## 1st Qu.: 0.00 1st Qu.: 223.0 1st Qu.: 795.8 Class :character
## Median : 0.00 Median : 477.5 Median : 991.5 Mode :character
## Mean : 46.55 Mean : 567.2 Mean :1057.4
## 3rd Qu.: 0.00 3rd Qu.: 808.0 3rd Qu.:1298.2
## Max. :1474.00 Max. :2336.0 Max. :6110.0
##
## HeatingQC CentralAir Electrical 1stFlrSF
## Length:1460 Length:1460 Length:1460 Min. : 334
## Class :character Class :character Class :character 1st Qu.: 882
## Mode :character Mode :character Mode :character Median :1087
## Mean :1163
## 3rd Qu.:1391
## Max. :4692
##
## 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath
## Min. : 0 Min. : 0.000 Min. : 334 Min. :0.0000
## 1st Qu.: 0 1st Qu.: 0.000 1st Qu.:1130 1st Qu.:0.0000
## Median : 0 Median : 0.000 Median :1464 Median :0.0000
## Mean : 347 Mean : 5.845 Mean :1515 Mean :0.4253
## 3rd Qu.: 728 3rd Qu.: 0.000 3rd Qu.:1777 3rd Qu.:1.0000
## Max. :2065 Max. :572.000 Max. :5642 Max. :3.0000
##
## BsmtHalfBath FullBath HalfBath BedroomAbvGr
## Min. :0.00000 Min. :0.000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000
## Median :0.00000 Median :2.000 Median :0.0000 Median :3.000
## Mean :0.05753 Mean :1.565 Mean :0.3829 Mean :2.866
## 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :2.00000 Max. :3.000 Max. :2.0000 Max. :8.000
##
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## Min. :0.000 Length:1460 Min. : 2.000 Length:1460
## 1st Qu.:1.000 Class :character 1st Qu.: 5.000 Class :character
## Median :1.000 Mode :character Median : 6.000 Mode :character
## Mean :1.047 Mean : 6.518
## 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :3.000 Max. :14.000
##
## Fireplaces FireplaceQu GarageType GarageYrBlt
## Min. :0.000 Length:1460 Length:1460 Min. :1900
## 1st Qu.:0.000 Class :character Class :character 1st Qu.:1961
## Median :1.000 Mode :character Mode :character Median :1980
## Mean :0.613 Mean :1979
## 3rd Qu.:1.000 3rd Qu.:2002
## Max. :3.000 Max. :2010
## NA's :81
## GarageFinish GarageCars GarageArea GarageQual
## Length:1460 Min. :0.000 Min. : 0.0 Length:1460
## Class :character 1st Qu.:1.000 1st Qu.: 334.5 Class :character
## Mode :character Median :2.000 Median : 480.0 Mode :character
## Mean :1.767 Mean : 473.0
## 3rd Qu.:2.000 3rd Qu.: 576.0
## Max. :4.000 Max. :1418.0
##
## GarageCond PavedDrive WoodDeckSF OpenPorchSF
## Length:1460 Length:1460 Min. : 0.00 Min. : 0.00
## Class :character Class :character 1st Qu.: 0.00 1st Qu.: 0.00
## Mode :character Mode :character Median : 0.00 Median : 25.00
## Mean : 94.24 Mean : 46.66
## 3rd Qu.:168.00 3rd Qu.: 68.00
## Max. :857.00 Max. :547.00
##
## EnclosedPorch 3SsnPorch ScreenPorch PoolArea
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 0.00 Median : 0.00 Median : 0.00 Median : 0.000
## Mean : 21.95 Mean : 3.41 Mean : 15.06 Mean : 2.759
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :552.00 Max. :508.00 Max. :480.00 Max. :738.000
##
## PoolQC Fence MiscFeature MiscVal
## Length:1460 Length:1460 Length:1460 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 0.00
## Mode :character Mode :character Mode :character Median : 0.00
## Mean : 43.49
## 3rd Qu.: 0.00
## Max. :15500.00
##
## MoSold YrSold SaleType SaleCondition
## Min. : 1.000 Min. :2006 Length:1460 Length:1460
## 1st Qu.: 5.000 1st Qu.:2007 Class :character Class :character
## Median : 6.000 Median :2008 Mode :character Mode :character
## Mean : 6.322 Mean :2008
## 3rd Qu.: 8.000 3rd Qu.:2009
## Max. :12.000 Max. :2010
##
## SalePrice
## Min. : 34900
## 1st Qu.:129975
## Median :163000
## Mean :180921
## 3rd Qu.:214000
## Max. :755000
##
numeric_columns <- sapply(train_data, is.numeric)
plot_data <- train_data[,numeric_columns]
par(mfrow=c(3, 3))
for (i in names(plot_data)){
hist(plot_data[[i]],main=paste(i,'Histogram'), xlab=i)}
X
and Y
.# Scatterplot of X vs. Y
ggplot(train_data, aes(x = GrLivArea, y = SalePrice)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", color = "green") +
labs(title = " SalePrice vs GrLivArea ", x = "GrLivArea (sq ft)", y = "SalePrice ($)")
## `geom_smooth()` using formula = 'y ~ x'
# Scale SalePrice for comparability
scaled_SalePrice <- scale(train_data$SalePrice)
# Calculate mean differences
mean_diff <- mean(train_data$GrLivArea) - mean(scaled_SalePrice)
# Standard error of the difference
se_diff <- sqrt((sd(train_data$GrLivArea)^2 / nrow(train_data)) + (sd(scaled_SalePrice)^2 / nrow(train_data)))
# 95% Confidence Interval
ci_lower <- mean_diff - qt(0.975, df = nrow(train_data) - 1) * se_diff
ci_upper <- mean_diff + qt(0.975, df = nrow(train_data) - 1) * se_diff
list(lower = ci_lower, upper = ci_upper)
## $lower
## [1] 1488.487
##
## $upper
## [1] 1542.44
two_variables <- c("OverallQual", "GrLivArea", "SalePrice")
cor_data <- plot_data[, two_variables]
cor_data
## # A tibble: 1,460 × 3
## OverallQual GrLivArea SalePrice
## <dbl> <dbl> <dbl>
## 1 7 1710 208500
## 2 6 1262 181500
## 3 7 1786 223500
## 4 7 1717 140000
## 5 8 2198 250000
## 6 5 1362 143000
## 7 8 1694 307000
## 8 7 2090 200000
## 9 7 1774 129900
## 10 5 1077 118000
## # ℹ 1,450 more rows
correlation_matrix <- cor(cor_data, use = "complete.obs")
corrplot(correlation_matrix, method = "color", type = "upper", addCoef.col = "white",
number.cex = 0.8)
cor_results <- list()
for (col_name in names(cor_data[-3])) {
cor_result <- cor.test(cor_data[['SalePrice']], cor_data[[col_name]], conf.level = 0.99)
cor_results[[col_name]] <- cor_result
}
cor_results
## $OverallQual
##
## Pearson's product-moment correlation
##
## data: cor_data[["SalePrice"]] and cor_data[[col_name]]
## t = 49.364, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 99 percent confidence interval:
## 0.7643382 0.8149288
## sample estimates:
## cor
## 0.7909816
##
##
## $GrLivArea
##
## Pearson's product-moment correlation
##
## data: cor_data[["SalePrice"]] and cor_data[[col_name]]
## t = 38.348, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 99 percent confidence interval:
## 0.6733974 0.7406408
## sample estimates:
## cor
## 0.7086245
small p-values (p < 2.2e-16), reject the null.
The correlation for all 2 variables are statistically significant. The correlation falls within the 99% CI
precision_matrix <- solve(correlation_matrix)
precision_matrix
## OverallQual GrLivArea SalePrice
## OverallQual 2.6865350 -0.1753704 -2.000728
## GrLivArea -0.1753704 2.0200794 -1.292763
## SalePrice -2.0007280 -1.2927630 3.498623
OverallQual: VIF = 2.69<5 This variable can generally be kept in the model without much concern about excessive multicollinearity affecting the coefficients. GrLivArea: VIF = 2.02. It suggests that GrLivArea is relatively independent of other variables in your model.
cor_prec = correlation_matrix %*% precision_matrix
prec_cor = precision_matrix %*% correlation_matrix
cor_prec
## OverallQual GrLivArea SalePrice
## OverallQual 1 0 0
## GrLivArea 0 1 0
## SalePrice 0 0 1
prec_cor
## OverallQual GrLivArea SalePrice
## OverallQual 1.000000e+00 -4.440892e-16 -8.881784e-16
## GrLivArea 0.000000e+00 1.000000e+00 0.000000e+00
## SalePrice 4.440892e-16 0.000000e+00 1.000000e+00
cor_prec and prec_cor closely resemble the identity matrix confirms precision matrix is computed correctly.
# Since there are more than 30 columns,focusing PCA on Key Variables provides clearer insights.
top_variables <- c("OverallQual", "GrLivArea", "GarageCars", "GarageArea", "TotalBsmtSF", "1stFlrSF", "FullBath")
selected_pca_results <- prcomp(train_data[, top_variables], scale. = TRUE)
# Summarize the PCA results
summary(selected_pca_results)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.0405 0.9780 0.9230 0.64859 0.5904 0.3880 0.32924
## Proportion of Variance 0.5948 0.1366 0.1217 0.06009 0.0498 0.0215 0.01549
## Cumulative Proportion 0.5948 0.7314 0.8531 0.91321 0.9630 0.9845 1.00000
# Create a Scree Plot with an elbow line
scree_plot <- function(pca_result) {
variances <- pca_result$sdev^2
proportions <- variances / sum(variances)
cum_proportions <- cumsum(proportions)
plot(cum_proportions, xlab = "Principal Component",
ylab = "Cumulative Proportion of Variance Explained",
type = "b", pch = 19, main = "Cumulative Proportion of Variance Explained with Elbow Line")
# Identify the elbow point (the point where the slope markedly changes)
elbow_point <- which.max(diff(diff(cum_proportions)))
abline(v = elbow_point, col = "red", lwd = 2, lty = 2)
# Highlight the elbow point
points(elbow_point, cum_proportions[elbow_point], col = "red", pch = 19, cex = 1.5)
return(invisible(elbow_point))
}
elbow_point <- scree_plot(selected_pca_results)
PCA insights: The principal components before the elbow (PC1 to PC4) are the most informative,(explain more than 90% of variance in SalePrice) as they capture the majority of the variance in the dataset.
fitdistr
to fit an
exponential probability density function.fit_result <- fitdistr(train_data$GrLivArea, "exponential")
fit_result
## rate
## 6.598640e-04
## (1.726943e-05)
rexp(1000, λ)
).lambda <- fit_result$estimate["rate"]
lambda
## rate
## 0.000659864
exp_samples <- rexp(1000, rate = lambda)
# Plotting histograms
par(mfrow = c(1, 2))
hist(train_data$GrLivArea,, main = "Original GrLivArea")
hist(exp_samples, main = "Exponential Sample")
par(mfrow = c(1, 1))
pi_5 <- qexp(0.05, rate = lambda)
pi_95 <- qexp(0.95, rate = lambda)
mean_samples <- mean(exp_samples)
std_error_samples <- sd(exp_samples) / sqrt(length(exp_samples))
# Generate a 95% confidence interval
CI <- c(mean_samples - 1.96 * std_error_samples, mean_samples + 1.96 * std_error_samples)
exp_pi_5 <- quantile(exp_samples, 0.05)
exp_pi_95 <- quantile(exp_samples, 0.95)
# Print results
cat("5th and 95th percentiles from fitted exponential:", pi_5, ",", pi_95, "\n")
## 5th and 95th percentiles from fitted exponential: 77.73313 , 4539.924
cat("95% Confidence Interval from the empirical data:", CI)
## 95% Confidence Interval from the empirical data: 1527.027 1725.238
cat("Empirical 5th & 95th percentile of the data:", exp_pi_5, ",", exp_pi_95)
## Empirical 5th & 95th percentile of the data: 79.17522 , 4744.321
The result confirms, the peak is near the minimum and tails off quickly, revealing that smaller spaces are far more common than larger ones.
# Rename columns to make syntactically valid names in R
names(train_data) <- make.names(names(train_data))
# Identify numeric and character features
numeric_features <- sapply(train_data, is.numeric)
character_features <- sapply(train_data, is.character)
# Fill missing values for numeric and character columns
train_data[numeric_features] <- lapply(train_data[numeric_features], function(x) {
if (any(is.na(x))) { x[is.na(x)] <- -999 }
return(x)
})
train_data[character_features] <- lapply(train_data[character_features], function(x) {
if (any(is.na(x))) { x[is.na(x)] <- "Others" }
return(x)
})
selected_vars <- train_data[, c("OverallQual", "GrLivArea", "GarageCars", "GarageArea", "TotalBsmtSF", "X1stFlrSF", "FullBath", "SalePrice")]
# Split the data into training and validation sets
set.seed(123)
train_index <- createDataPartition(selected_vars$SalePrice, p = 0.8, list = FALSE)
X_train <- selected_vars[train_index, -8]
y_train <- selected_vars[train_index, 8]
X_val <- selected_vars[-train_index, -8]
y_val <- selected_vars[-train_index, 8]
train_data_for_slr <- data.frame(GrLivArea = X_train$GrLivArea, SalePrice = y_train)
slr <- lm(SalePrice ~ GrLivArea, data = train_data_for_slr)
summary(slr)
##
## Call:
## lm(formula = SalePrice ~ GrLivArea, data = train_data_for_slr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -457901 -30333 -888 21700 280475
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19961.333 5034.287 3.965 7.78e-05 ***
## GrLivArea 105.980 3.167 33.460 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 54460 on 1167 degrees of freedom
## Multiple R-squared: 0.4896, Adjusted R-squared: 0.4892
## F-statistic: 1120 on 1 and 1167 DF, p-value: < 2.2e-16
slr_predictions <- predict(slr, newdata = data.frame(GrLivArea = X_val$GrLivArea))
slr_rmse <- rmse(y_val$SalePrice, slr_predictions)
print(paste("RMSE for Simple Linear Regression:", slr_rmse))
## [1] "RMSE for Simple Linear Regression: 62153.5420191601"
train_data_for_mlr <- data.frame(X_train, SalePrice = y_train)
mlr <- lm(SalePrice ~ OverallQual + GrLivArea + GarageCars + GarageArea + TotalBsmtSF + `X1stFlrSF` + FullBath, data = train_data_for_mlr)
summary(mlr)
##
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea + GarageCars +
## GarageArea + TotalBsmtSF + X1stFlrSF + FullBath, data = train_data_for_mlr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -442048 -20272 -1481 16672 227847
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.006e+05 5.233e+03 -19.233 < 2e-16 ***
## OverallQual 2.414e+04 1.159e+03 20.820 < 2e-16 ***
## GrLivArea 3.772e+01 3.151e+00 11.970 < 2e-16 ***
## GarageCars 1.412e+04 3.147e+03 4.486 7.98e-06 ***
## GarageArea 2.427e+01 1.079e+01 2.249 0.0247 *
## TotalBsmtSF 2.062e+01 4.460e+00 4.623 4.21e-06 ***
## X1stFlrSF 1.143e+01 5.265e+00 2.170 0.0302 *
## FullBath 3.112e+03 2.650e+03 1.174 0.2405
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36440 on 1161 degrees of freedom
## Multiple R-squared: 0.7727, Adjusted R-squared: 0.7713
## F-statistic: 563.7 on 7 and 1161 DF, p-value: < 2.2e-16
mlr_predictions <- predict(mlr, newdata = X_val)
mlr_rmse <- rmse(y_val$SalePrice, mlr_predictions)
print(paste("RMSE for Multiple Linear Regression:", mlr_rmse))
## [1] "RMSE for Multiple Linear Regression: 47634.810484816"
train_data[character_features] <- lapply(train_data[character_features], as.factor)
full_model <- lm(as.formula(paste("SalePrice ~", paste(setdiff(names(train_data), c("Id", "SalePrice")), collapse = " + "))), data = train_data)
# Extracting statistically significant variables (p-value <= 0.05)
significant_vars <- summary(full_model)$coefficients[summary(full_model)$coefficients[, "Pr(>|t|)"] <= 0.05, ]
significant_vars <- rownames(significant_vars)
available_vars <- names(train_data)[names(train_data) %in% significant_vars]
final_model_formula <- paste("SalePrice ~", paste(available_vars, collapse = " + "))
final_model <- lm(as.formula(final_model_formula), data = train_data)
summary(final_model)
##
## Call:
## lm(formula = as.formula(final_model_formula), data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -537081 -16309 -2255 12838 292293
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.550e+05 8.946e+04 -9.557 < 2e-16 ***
## LotArea 4.700e-01 1.033e-01 4.549 5.83e-06 ***
## OverallQual 1.880e+04 1.175e+03 15.998 < 2e-16 ***
## OverallCond 5.219e+03 9.535e+02 5.474 5.19e-08 ***
## YearBuilt 3.986e+02 4.537e+01 8.785 < 2e-16 ***
## MasVnrArea 1.881e+01 5.289e+00 3.556 0.000388 ***
## BsmtFinSF1 2.216e+01 4.306e+00 5.146 3.03e-07 ***
## BsmtFinSF2 8.968e+00 7.029e+00 1.276 0.202217
## BsmtUnfSF 9.614e+00 4.288e+00 2.242 0.025115 *
## X1stFlrSF 6.478e+01 5.133e+00 12.619 < 2e-16 ***
## X2ndFlrSF 5.464e+01 3.381e+00 16.160 < 2e-16 ***
## BedroomAbvGr -5.362e+03 1.484e+03 -3.614 0.000312 ***
## KitchenAbvGr -1.582e+04 4.724e+03 -3.348 0.000835 ***
## Fireplaces 3.585e+03 1.792e+03 2.001 0.045604 *
## GarageArea 3.423e+01 5.918e+00 5.783 8.97e-09 ***
## WoodDeckSF 2.719e+01 8.144e+00 3.339 0.000864 ***
## ScreenPorch 4.835e+01 1.751e+01 2.761 0.005835 **
## PoolArea -5.420e+01 2.407e+01 -2.252 0.024476 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35970 on 1442 degrees of freedom
## Multiple R-squared: 0.7974, Adjusted R-squared: 0.795
## F-statistic: 333.9 on 17 and 1442 DF, p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(final_model)
The Simple Linear Regression (SLR) utilizing ‘GrLivArea’ explains about 48.96% of the variance in ‘SalePrice’, with a notable RMSE of 62,153. Progressing to the Multiple Linear Regression (MLR), which incorporates variables like ‘OverallQual’, ‘GarageCars’, and ‘TotalBsmtSF’, enhances the R-squared to 77.27%, reducing the RMSE to 47,635. Finally, our comprehensive model, which filters only statistically significant predictors, achieves an R-squared of 79.74%.This final model, with its robust predictive capability, will be utilized for predictions on the test dataset.
# Load test data
test_data <- read_csv("test.csv")
## Rows: 1459 Columns: 80
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (43): MSZoning, Street, Alley, LotShape, LandContour, Utilities, LotConf...
## dbl (37): Id, MSSubClass, LotFrontage, LotArea, OverallQual, OverallCond, Ye...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Rename columns to make syntactically valid names in R
names(test_data) <- make.names(names(test_data))
# Identify numeric and character features in test_data
numeric_features_test <- sapply(test_data, is.numeric)
character_features_test <- sapply(test_data, is.character)
# Impute missing values for numeric and character columns
test_data[numeric_features_test] <- lapply(test_data[numeric_features_test], function(x) {
if (any(is.na(x))) { x[is.na(x)] <- -999 }
return(x)
})
test_data[character_features_test] <- lapply(test_data[character_features_test], function(x) {
if (any(is.na(x))) { x[is.na(x)] <- "Others" }
return(x)
})
# Convert character features to factors based on training data treatment
test_data[character_features_test] <- lapply(test_data[character_features_test], as.factor)
# Predict SalePrice using the final model
test_data$SalePrice <- predict(final_model, newdata = test_data)
# Create submission data frame
submission <- test_data[, c("Id", "SalePrice")]
# Write submission to a CSV file
write.csv(submission, "submission.csv", row.names = FALSE)