You are to register for Kaggle.com (free) and compete in the House Prices: Advanced Regression Techniques competition. Follow the steps below to complete the exam requirements.
library(readr)
library(dplyr)
library(purrr)
library(tidyr)
library(ggplot2)
library(moments)
library(corrplot)
library(stats)
library(MASS)
library(skimr)
library(caret)
library(Metrics)
train.csv) and define that variable as
X. Make sure this variable is skewed to the right.Y.train_data <- read_csv("train.csv")
## Rows: 1460 Columns: 81
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (43): MSZoning, Street, Alley, LotShape, LandContour, Utilities, LotConf...
## dbl (38): Id, MSSubClass, LotFrontage, LotArea, OverallQual, OverallCond, Ye...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(train_data)
## # A tibble: 6 × 81
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## <dbl> <dbl> <chr> <dbl> <dbl> <chr> <chr> <chr>
## 1 1 60 RL 65 8450 Pave <NA> Reg
## 2 2 20 RL 80 9600 Pave <NA> Reg
## 3 3 60 RL 68 11250 Pave <NA> IR1
## 4 4 70 RL 60 9550 Pave <NA> IR1
## 5 5 60 RL 84 14260 Pave <NA> IR1
## 6 6 50 RL 85 14115 Pave <NA> IR1
## # ℹ 73 more variables: LandContour <chr>, Utilities <chr>, LotConfig <chr>,
## # LandSlope <chr>, Neighborhood <chr>, Condition1 <chr>, Condition2 <chr>,
## # BldgType <chr>, HouseStyle <chr>, OverallQual <dbl>, OverallCond <dbl>,
## # YearBuilt <dbl>, YearRemodAdd <dbl>, RoofStyle <chr>, RoofMatl <chr>,
## # Exterior1st <chr>, Exterior2nd <chr>, MasVnrType <chr>, MasVnrArea <dbl>,
## # ExterQual <chr>, ExterCond <chr>, Foundation <chr>, BsmtQual <chr>,
## # BsmtCond <chr>, BsmtExposure <chr>, BsmtFinType1 <chr>, BsmtFinSF1 <dbl>, …
numeric_data <- select_if(train_data, is.numeric)
# Calculate skewness for each numeric column
skewness_data <- sapply(numeric_data, skewness)
# Filter for highly skewed variables (skewness > 1 indicates right skew)
skewed_variables <- skewness_data[skewness_data > 1]
names(skewed_variables)
## [1] "MSSubClass" NA "LotArea" NA
## [5] "BsmtFinSF1" "BsmtFinSF2" "TotalBsmtSF" "1stFlrSF"
## [9] "LowQualFinSF" "GrLivArea" "BsmtHalfBath" "KitchenAbvGr"
## [13] NA "WoodDeckSF" "OpenPorchSF" "EnclosedPorch"
## [17] "3SsnPorch" "ScreenPorch" "PoolArea" "MiscVal"
## [21] "SalePrice"
X <- train_data$GrLivArea
ggplot(train_data, aes(x = GrLivArea)) + geom_histogram(aes(y = ..density..), binwidth = 50, fill = "blue", alpha = 0.7) + geom_density(color = "red") + labs(title = "Distribution of GrLivArea", x = "Above Ground Living Area (sq ft)", y = "Density") + theme_minimal()
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Y <- train_data$SalePrice
Calculate as a minimum the below probabilities a through c. Assume the small letter “x” is estimated as the 3d quartile of the X variable, and the small letter “y” is estimated as the 2d quartile of the Y variable. Interpret the meaning of all probabilities. In addition, make a table of counts as shown below.
x <- quantile(X, 0.75) # 3rd quartile for X
y <- quantile(Y, 0.50) # 2nd quartile for Y
P_X_greater_x_given_Y_greater_y <- mean(X > x & Y > y) / mean(Y > y)
P_X_greater_x_and_Y_greater_y <- mean(X > x & Y > y)
P_X_less_x_given_Y_greater_y <- mean(X < x & Y > y) / mean(Y > y)
P_X_greater_x_given_Y_greater_y
## [1] 0.4326923
P_X_greater_x_and_Y_greater_y
## [1] 0.2157534
P_X_less_x_given_Y_greater_y
## [1] 0.5673077
P(X>x | Y>y): Given that house prices are above the median, 43.27% of houses have larger living areas than most.
P(X>x , Y>y): Only 21.58% of all houses have both prices above median and larger-than-typical living areas.
P(X<x | Y>y): When house prices exceed the median, 56.73% of these have living areas smaller than the upper quartile
Larger living areas less common in higher-priced homes; price not solely driven by size.
Create a table of counts as shown below:
| x/y | <=2d quartile | >2d quartile | Total |
|---|---|---|---|
| <=3d quartile | |||
| >3d quartile | |||
| Total |
# Creating a contingency table
count_table <- matrix(ncol = 3, nrow = 3)
colnames(count_table) <- c("<=2nd quartile Y", ">2nd quartile Y", "Total")
rownames(count_table) <- c("<=3rd quartile X", ">3rd quartile X", "Total")
# Filling the table
count_table[1, 1] <- sum(X <= x & Y <= y)
count_table[1, 2] <- sum(X <= x & Y > y)
count_table[2, 1] <- sum(X > x & Y <= y)
count_table[2, 2] <- sum(X > x & Y > y)
count_table[1, 3] <- sum(X <= x)
count_table[2, 3] <- sum(X > x)
count_table[3, 1] <- sum(Y <= y)
count_table[3, 2] <- sum(Y > y)
count_table[3, 3] <- nrow(train_data)
# Printing the table
as.table(count_table)
## <=2nd quartile Y >2nd quartile Y Total
## <=3rd quartile X 682 413 1095
## >3rd quartile X 50 315 365
## Total 732 728 1460
A as the variable counting observations above
the 3rd quartile for X, and B as the variable
counting observations above the 2nd quartile for Y.# Variables for independence check
A <- ifelse(X > x, 1, 0) # 1 if X is above the 3rd quartile
B <- ifelse(Y > y, 1, 0) # 1 if Y is above the 2nd quartile
# Contingency table for chi-squared test
independence_table <- table(A, B)
# Performing chi-squared test
chi_squared_test <- chisq.test(independence_table)
# Checking if probabilities multiply to joint probability
P_A <- mean(A == 1)
P_B <- mean(B == 1)
P_A_and_B <- mean(A == 1 & B == 1)
independent_check <- P_A_and_B == P_A * P_B
# Output results
list(independence = independent_check,chi_squared_test = chi_squared_test
)
## $independence
## [1] FALSE
##
## $chi_squared_test
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: independence_table
## X-squared = 256.53, df = 1, p-value < 2.2e-16
The above result shows that A (observations above the 3rd quartile for living area) and 𝐵 B (observations above the 2nd quartile for sale price) are not independent. The mathmatical calculation & statistical tests show a significant association between larger living areas and higher sale prices.
skim(train_data)
| Name | train_data |
| Number of rows | 1460 |
| Number of columns | 81 |
| _______________________ | |
| Column type frequency: | |
| character | 43 |
| numeric | 38 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| MSZoning | 0 | 1.00 | 2 | 7 | 0 | 5 | 0 |
| Street | 0 | 1.00 | 4 | 4 | 0 | 2 | 0 |
| Alley | 1369 | 0.06 | 4 | 4 | 0 | 2 | 0 |
| LotShape | 0 | 1.00 | 3 | 3 | 0 | 4 | 0 |
| LandContour | 0 | 1.00 | 3 | 3 | 0 | 4 | 0 |
| Utilities | 0 | 1.00 | 6 | 6 | 0 | 2 | 0 |
| LotConfig | 0 | 1.00 | 3 | 7 | 0 | 5 | 0 |
| LandSlope | 0 | 1.00 | 3 | 3 | 0 | 3 | 0 |
| Neighborhood | 0 | 1.00 | 5 | 7 | 0 | 25 | 0 |
| Condition1 | 0 | 1.00 | 4 | 6 | 0 | 9 | 0 |
| Condition2 | 0 | 1.00 | 4 | 6 | 0 | 8 | 0 |
| BldgType | 0 | 1.00 | 4 | 6 | 0 | 5 | 0 |
| HouseStyle | 0 | 1.00 | 4 | 6 | 0 | 8 | 0 |
| RoofStyle | 0 | 1.00 | 3 | 7 | 0 | 6 | 0 |
| RoofMatl | 0 | 1.00 | 4 | 7 | 0 | 8 | 0 |
| Exterior1st | 0 | 1.00 | 5 | 7 | 0 | 15 | 0 |
| Exterior2nd | 0 | 1.00 | 5 | 7 | 0 | 16 | 0 |
| MasVnrType | 8 | 0.99 | 4 | 7 | 0 | 4 | 0 |
| ExterQual | 0 | 1.00 | 2 | 2 | 0 | 4 | 0 |
| ExterCond | 0 | 1.00 | 2 | 2 | 0 | 5 | 0 |
| Foundation | 0 | 1.00 | 4 | 6 | 0 | 6 | 0 |
| BsmtQual | 37 | 0.97 | 2 | 2 | 0 | 4 | 0 |
| BsmtCond | 37 | 0.97 | 2 | 2 | 0 | 4 | 0 |
| BsmtExposure | 38 | 0.97 | 2 | 2 | 0 | 4 | 0 |
| BsmtFinType1 | 37 | 0.97 | 3 | 3 | 0 | 6 | 0 |
| BsmtFinType2 | 38 | 0.97 | 3 | 3 | 0 | 6 | 0 |
| Heating | 0 | 1.00 | 4 | 5 | 0 | 6 | 0 |
| HeatingQC | 0 | 1.00 | 2 | 2 | 0 | 5 | 0 |
| CentralAir | 0 | 1.00 | 1 | 1 | 0 | 2 | 0 |
| Electrical | 1 | 1.00 | 3 | 5 | 0 | 5 | 0 |
| KitchenQual | 0 | 1.00 | 2 | 2 | 0 | 4 | 0 |
| Functional | 0 | 1.00 | 3 | 4 | 0 | 7 | 0 |
| FireplaceQu | 690 | 0.53 | 2 | 2 | 0 | 5 | 0 |
| GarageType | 81 | 0.94 | 6 | 7 | 0 | 6 | 0 |
| GarageFinish | 81 | 0.94 | 3 | 3 | 0 | 3 | 0 |
| GarageQual | 81 | 0.94 | 2 | 2 | 0 | 5 | 0 |
| GarageCond | 81 | 0.94 | 2 | 2 | 0 | 5 | 0 |
| PavedDrive | 0 | 1.00 | 1 | 1 | 0 | 3 | 0 |
| PoolQC | 1453 | 0.00 | 2 | 2 | 0 | 3 | 0 |
| Fence | 1179 | 0.19 | 4 | 5 | 0 | 4 | 0 |
| MiscFeature | 1406 | 0.04 | 4 | 4 | 0 | 4 | 0 |
| SaleType | 0 | 1.00 | 2 | 5 | 0 | 9 | 0 |
| SaleCondition | 0 | 1.00 | 6 | 7 | 0 | 6 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Id | 0 | 1.00 | 730.50 | 421.61 | 1 | 365.75 | 730.5 | 1095.25 | 1460 | ▇▇▇▇▇ |
| MSSubClass | 0 | 1.00 | 56.90 | 42.30 | 20 | 20.00 | 50.0 | 70.00 | 190 | ▇▅▂▁▁ |
| LotFrontage | 259 | 0.82 | 70.05 | 24.28 | 21 | 59.00 | 69.0 | 80.00 | 313 | ▇▃▁▁▁ |
| LotArea | 0 | 1.00 | 10516.83 | 9981.26 | 1300 | 7553.50 | 9478.5 | 11601.50 | 215245 | ▇▁▁▁▁ |
| OverallQual | 0 | 1.00 | 6.10 | 1.38 | 1 | 5.00 | 6.0 | 7.00 | 10 | ▁▂▇▅▁ |
| OverallCond | 0 | 1.00 | 5.58 | 1.11 | 1 | 5.00 | 5.0 | 6.00 | 9 | ▁▁▇▅▁ |
| YearBuilt | 0 | 1.00 | 1971.27 | 30.20 | 1872 | 1954.00 | 1973.0 | 2000.00 | 2010 | ▁▂▃▆▇ |
| YearRemodAdd | 0 | 1.00 | 1984.87 | 20.65 | 1950 | 1967.00 | 1994.0 | 2004.00 | 2010 | ▅▂▂▃▇ |
| MasVnrArea | 8 | 0.99 | 103.69 | 181.07 | 0 | 0.00 | 0.0 | 166.00 | 1600 | ▇▁▁▁▁ |
| BsmtFinSF1 | 0 | 1.00 | 443.64 | 456.10 | 0 | 0.00 | 383.5 | 712.25 | 5644 | ▇▁▁▁▁ |
| BsmtFinSF2 | 0 | 1.00 | 46.55 | 161.32 | 0 | 0.00 | 0.0 | 0.00 | 1474 | ▇▁▁▁▁ |
| BsmtUnfSF | 0 | 1.00 | 567.24 | 441.87 | 0 | 223.00 | 477.5 | 808.00 | 2336 | ▇▅▂▁▁ |
| TotalBsmtSF | 0 | 1.00 | 1057.43 | 438.71 | 0 | 795.75 | 991.5 | 1298.25 | 6110 | ▇▃▁▁▁ |
| 1stFlrSF | 0 | 1.00 | 1162.63 | 386.59 | 334 | 882.00 | 1087.0 | 1391.25 | 4692 | ▇▅▁▁▁ |
| 2ndFlrSF | 0 | 1.00 | 346.99 | 436.53 | 0 | 0.00 | 0.0 | 728.00 | 2065 | ▇▃▂▁▁ |
| LowQualFinSF | 0 | 1.00 | 5.84 | 48.62 | 0 | 0.00 | 0.0 | 0.00 | 572 | ▇▁▁▁▁ |
| GrLivArea | 0 | 1.00 | 1515.46 | 525.48 | 334 | 1129.50 | 1464.0 | 1776.75 | 5642 | ▇▇▁▁▁ |
| BsmtFullBath | 0 | 1.00 | 0.43 | 0.52 | 0 | 0.00 | 0.0 | 1.00 | 3 | ▇▆▁▁▁ |
| BsmtHalfBath | 0 | 1.00 | 0.06 | 0.24 | 0 | 0.00 | 0.0 | 0.00 | 2 | ▇▁▁▁▁ |
| FullBath | 0 | 1.00 | 1.57 | 0.55 | 0 | 1.00 | 2.0 | 2.00 | 3 | ▁▇▁▇▁ |
| HalfBath | 0 | 1.00 | 0.38 | 0.50 | 0 | 0.00 | 0.0 | 1.00 | 2 | ▇▁▅▁▁ |
| BedroomAbvGr | 0 | 1.00 | 2.87 | 0.82 | 0 | 2.00 | 3.0 | 3.00 | 8 | ▁▇▂▁▁ |
| KitchenAbvGr | 0 | 1.00 | 1.05 | 0.22 | 0 | 1.00 | 1.0 | 1.00 | 3 | ▁▇▁▁▁ |
| TotRmsAbvGrd | 0 | 1.00 | 6.52 | 1.63 | 2 | 5.00 | 6.0 | 7.00 | 14 | ▂▇▇▁▁ |
| Fireplaces | 0 | 1.00 | 0.61 | 0.64 | 0 | 0.00 | 1.0 | 1.00 | 3 | ▇▇▁▁▁ |
| GarageYrBlt | 81 | 0.94 | 1978.51 | 24.69 | 1900 | 1961.00 | 1980.0 | 2002.00 | 2010 | ▁▁▅▅▇ |
| GarageCars | 0 | 1.00 | 1.77 | 0.75 | 0 | 1.00 | 2.0 | 2.00 | 4 | ▁▃▇▂▁ |
| GarageArea | 0 | 1.00 | 472.98 | 213.80 | 0 | 334.50 | 480.0 | 576.00 | 1418 | ▂▇▃▁▁ |
| WoodDeckSF | 0 | 1.00 | 94.24 | 125.34 | 0 | 0.00 | 0.0 | 168.00 | 857 | ▇▂▁▁▁ |
| OpenPorchSF | 0 | 1.00 | 46.66 | 66.26 | 0 | 0.00 | 25.0 | 68.00 | 547 | ▇▁▁▁▁ |
| EnclosedPorch | 0 | 1.00 | 21.95 | 61.12 | 0 | 0.00 | 0.0 | 0.00 | 552 | ▇▁▁▁▁ |
| 3SsnPorch | 0 | 1.00 | 3.41 | 29.32 | 0 | 0.00 | 0.0 | 0.00 | 508 | ▇▁▁▁▁ |
| ScreenPorch | 0 | 1.00 | 15.06 | 55.76 | 0 | 0.00 | 0.0 | 0.00 | 480 | ▇▁▁▁▁ |
| PoolArea | 0 | 1.00 | 2.76 | 40.18 | 0 | 0.00 | 0.0 | 0.00 | 738 | ▇▁▁▁▁ |
| MiscVal | 0 | 1.00 | 43.49 | 496.12 | 0 | 0.00 | 0.0 | 0.00 | 15500 | ▇▁▁▁▁ |
| MoSold | 0 | 1.00 | 6.32 | 2.70 | 1 | 5.00 | 6.0 | 8.00 | 12 | ▃▆▇▃▃ |
| YrSold | 0 | 1.00 | 2007.82 | 1.33 | 2006 | 2007.00 | 2008.0 | 2009.00 | 2010 | ▇▇▇▇▅ |
| SalePrice | 0 | 1.00 | 180921.20 | 79442.50 | 34900 | 129975.00 | 163000.0 | 214000.00 | 755000 | ▇▅▁▁▁ |
summary(train_data)
## Id MSSubClass MSZoning LotFrontage
## Min. : 1.0 Min. : 20.0 Length:1460 Min. : 21.00
## 1st Qu.: 365.8 1st Qu.: 20.0 Class :character 1st Qu.: 59.00
## Median : 730.5 Median : 50.0 Mode :character Median : 69.00
## Mean : 730.5 Mean : 56.9 Mean : 70.05
## 3rd Qu.:1095.2 3rd Qu.: 70.0 3rd Qu.: 80.00
## Max. :1460.0 Max. :190.0 Max. :313.00
## NA's :259
## LotArea Street Alley LotShape
## Min. : 1300 Length:1460 Length:1460 Length:1460
## 1st Qu.: 7554 Class :character Class :character Class :character
## Median : 9478 Mode :character Mode :character Mode :character
## Mean : 10517
## 3rd Qu.: 11602
## Max. :215245
##
## LandContour Utilities LotConfig LandSlope
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Neighborhood Condition1 Condition2 BldgType
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## HouseStyle OverallQual OverallCond YearBuilt
## Length:1460 Min. : 1.000 Min. :1.000 Min. :1872
## Class :character 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954
## Mode :character Median : 6.000 Median :5.000 Median :1973
## Mean : 6.099 Mean :5.575 Mean :1971
## 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000
## Max. :10.000 Max. :9.000 Max. :2010
##
## YearRemodAdd RoofStyle RoofMatl Exterior1st
## Min. :1950 Length:1460 Length:1460 Length:1460
## 1st Qu.:1967 Class :character Class :character Class :character
## Median :1994 Mode :character Mode :character Mode :character
## Mean :1985
## 3rd Qu.:2004
## Max. :2010
##
## Exterior2nd MasVnrType MasVnrArea ExterQual
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 0.0 Mode :character
## Mean : 103.7
## 3rd Qu.: 166.0
## Max. :1600.0
## NA's :8
## ExterCond Foundation BsmtQual BsmtCond
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 383.5 Mode :character
## Mean : 443.6
## 3rd Qu.: 712.2
## Max. :5644.0
##
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Length:1460
## 1st Qu.: 0.00 1st Qu.: 223.0 1st Qu.: 795.8 Class :character
## Median : 0.00 Median : 477.5 Median : 991.5 Mode :character
## Mean : 46.55 Mean : 567.2 Mean :1057.4
## 3rd Qu.: 0.00 3rd Qu.: 808.0 3rd Qu.:1298.2
## Max. :1474.00 Max. :2336.0 Max. :6110.0
##
## HeatingQC CentralAir Electrical 1stFlrSF
## Length:1460 Length:1460 Length:1460 Min. : 334
## Class :character Class :character Class :character 1st Qu.: 882
## Mode :character Mode :character Mode :character Median :1087
## Mean :1163
## 3rd Qu.:1391
## Max. :4692
##
## 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath
## Min. : 0 Min. : 0.000 Min. : 334 Min. :0.0000
## 1st Qu.: 0 1st Qu.: 0.000 1st Qu.:1130 1st Qu.:0.0000
## Median : 0 Median : 0.000 Median :1464 Median :0.0000
## Mean : 347 Mean : 5.845 Mean :1515 Mean :0.4253
## 3rd Qu.: 728 3rd Qu.: 0.000 3rd Qu.:1777 3rd Qu.:1.0000
## Max. :2065 Max. :572.000 Max. :5642 Max. :3.0000
##
## BsmtHalfBath FullBath HalfBath BedroomAbvGr
## Min. :0.00000 Min. :0.000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000
## Median :0.00000 Median :2.000 Median :0.0000 Median :3.000
## Mean :0.05753 Mean :1.565 Mean :0.3829 Mean :2.866
## 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :2.00000 Max. :3.000 Max. :2.0000 Max. :8.000
##
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## Min. :0.000 Length:1460 Min. : 2.000 Length:1460
## 1st Qu.:1.000 Class :character 1st Qu.: 5.000 Class :character
## Median :1.000 Mode :character Median : 6.000 Mode :character
## Mean :1.047 Mean : 6.518
## 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :3.000 Max. :14.000
##
## Fireplaces FireplaceQu GarageType GarageYrBlt
## Min. :0.000 Length:1460 Length:1460 Min. :1900
## 1st Qu.:0.000 Class :character Class :character 1st Qu.:1961
## Median :1.000 Mode :character Mode :character Median :1980
## Mean :0.613 Mean :1979
## 3rd Qu.:1.000 3rd Qu.:2002
## Max. :3.000 Max. :2010
## NA's :81
## GarageFinish GarageCars GarageArea GarageQual
## Length:1460 Min. :0.000 Min. : 0.0 Length:1460
## Class :character 1st Qu.:1.000 1st Qu.: 334.5 Class :character
## Mode :character Median :2.000 Median : 480.0 Mode :character
## Mean :1.767 Mean : 473.0
## 3rd Qu.:2.000 3rd Qu.: 576.0
## Max. :4.000 Max. :1418.0
##
## GarageCond PavedDrive WoodDeckSF OpenPorchSF
## Length:1460 Length:1460 Min. : 0.00 Min. : 0.00
## Class :character Class :character 1st Qu.: 0.00 1st Qu.: 0.00
## Mode :character Mode :character Median : 0.00 Median : 25.00
## Mean : 94.24 Mean : 46.66
## 3rd Qu.:168.00 3rd Qu.: 68.00
## Max. :857.00 Max. :547.00
##
## EnclosedPorch 3SsnPorch ScreenPorch PoolArea
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 0.00 Median : 0.00 Median : 0.00 Median : 0.000
## Mean : 21.95 Mean : 3.41 Mean : 15.06 Mean : 2.759
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :552.00 Max. :508.00 Max. :480.00 Max. :738.000
##
## PoolQC Fence MiscFeature MiscVal
## Length:1460 Length:1460 Length:1460 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 0.00
## Mode :character Mode :character Mode :character Median : 0.00
## Mean : 43.49
## 3rd Qu.: 0.00
## Max. :15500.00
##
## MoSold YrSold SaleType SaleCondition
## Min. : 1.000 Min. :2006 Length:1460 Length:1460
## 1st Qu.: 5.000 1st Qu.:2007 Class :character Class :character
## Median : 6.000 Median :2008 Mode :character Mode :character
## Mean : 6.322 Mean :2008
## 3rd Qu.: 8.000 3rd Qu.:2009
## Max. :12.000 Max. :2010
##
## SalePrice
## Min. : 34900
## 1st Qu.:129975
## Median :163000
## Mean :180921
## 3rd Qu.:214000
## Max. :755000
##
numeric_columns <- sapply(train_data, is.numeric)
plot_data <- train_data[,numeric_columns]
par(mfrow=c(3, 3))
for (i in names(plot_data)){
hist(plot_data[[i]],main=paste(i,'Histogram'), xlab=i)}
X and Y.# Scatterplot of X vs. Y
ggplot(train_data, aes(x = GrLivArea, y = SalePrice)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", color = "green") +
labs(title = " SalePrice vs GrLivArea ", x = "GrLivArea (sq ft)", y = "SalePrice ($)")
## `geom_smooth()` using formula = 'y ~ x'
# Scale SalePrice for comparability
scaled_SalePrice <- scale(train_data$SalePrice)
# Calculate mean differences
mean_diff <- mean(train_data$GrLivArea) - mean(scaled_SalePrice)
# Standard error of the difference
se_diff <- sqrt((sd(train_data$GrLivArea)^2 / nrow(train_data)) + (sd(scaled_SalePrice)^2 / nrow(train_data)))
# 95% Confidence Interval
ci_lower <- mean_diff - qt(0.975, df = nrow(train_data) - 1) * se_diff
ci_upper <- mean_diff + qt(0.975, df = nrow(train_data) - 1) * se_diff
list(lower = ci_lower, upper = ci_upper)
## $lower
## [1] 1488.487
##
## $upper
## [1] 1542.44
two_variables <- c("OverallQual", "GrLivArea", "SalePrice")
cor_data <- plot_data[, two_variables]
cor_data
## # A tibble: 1,460 × 3
## OverallQual GrLivArea SalePrice
## <dbl> <dbl> <dbl>
## 1 7 1710 208500
## 2 6 1262 181500
## 3 7 1786 223500
## 4 7 1717 140000
## 5 8 2198 250000
## 6 5 1362 143000
## 7 8 1694 307000
## 8 7 2090 200000
## 9 7 1774 129900
## 10 5 1077 118000
## # ℹ 1,450 more rows
correlation_matrix <- cor(cor_data, use = "complete.obs")
corrplot(correlation_matrix, method = "color", type = "upper", addCoef.col = "white",
number.cex = 0.8)
cor_results <- list()
for (col_name in names(cor_data[-3])) {
cor_result <- cor.test(cor_data[['SalePrice']], cor_data[[col_name]], conf.level = 0.99)
cor_results[[col_name]] <- cor_result
}
cor_results
## $OverallQual
##
## Pearson's product-moment correlation
##
## data: cor_data[["SalePrice"]] and cor_data[[col_name]]
## t = 49.364, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 99 percent confidence interval:
## 0.7643382 0.8149288
## sample estimates:
## cor
## 0.7909816
##
##
## $GrLivArea
##
## Pearson's product-moment correlation
##
## data: cor_data[["SalePrice"]] and cor_data[[col_name]]
## t = 38.348, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 99 percent confidence interval:
## 0.6733974 0.7406408
## sample estimates:
## cor
## 0.7086245
small p-values (p < 2.2e-16), reject the null.
The correlation for all 2 variables are statistically significant. The correlation falls within the 99% CI
precision_matrix <- solve(correlation_matrix)
precision_matrix
## OverallQual GrLivArea SalePrice
## OverallQual 2.6865350 -0.1753704 -2.000728
## GrLivArea -0.1753704 2.0200794 -1.292763
## SalePrice -2.0007280 -1.2927630 3.498623
OverallQual: VIF = 2.69<5 This variable can generally be kept in the model without much concern about excessive multicollinearity affecting the coefficients. GrLivArea: VIF = 2.02. It suggests that GrLivArea is relatively independent of other variables in your model.
cor_prec = correlation_matrix %*% precision_matrix
prec_cor = precision_matrix %*% correlation_matrix
cor_prec
## OverallQual GrLivArea SalePrice
## OverallQual 1 0 0
## GrLivArea 0 1 0
## SalePrice 0 0 1
prec_cor
## OverallQual GrLivArea SalePrice
## OverallQual 1.000000e+00 -4.440892e-16 -8.881784e-16
## GrLivArea 0.000000e+00 1.000000e+00 0.000000e+00
## SalePrice 4.440892e-16 0.000000e+00 1.000000e+00
cor_prec and prec_cor closely resemble the identity matrix confirms precision matrix is computed correctly.
# Since there are more than 30 columns,focusing PCA on Key Variables provides clearer insights.
top_variables <- c("OverallQual", "GrLivArea", "GarageCars", "GarageArea", "TotalBsmtSF", "1stFlrSF", "FullBath")
selected_pca_results <- prcomp(train_data[, top_variables], scale. = TRUE)
# Summarize the PCA results
summary(selected_pca_results)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.0405 0.9780 0.9230 0.64859 0.5904 0.3880 0.32924
## Proportion of Variance 0.5948 0.1366 0.1217 0.06009 0.0498 0.0215 0.01549
## Cumulative Proportion 0.5948 0.7314 0.8531 0.91321 0.9630 0.9845 1.00000
# Create a Scree Plot with an elbow line
scree_plot <- function(pca_result) {
variances <- pca_result$sdev^2
proportions <- variances / sum(variances)
cum_proportions <- cumsum(proportions)
plot(cum_proportions, xlab = "Principal Component",
ylab = "Cumulative Proportion of Variance Explained",
type = "b", pch = 19, main = "Cumulative Proportion of Variance Explained with Elbow Line")
# Identify the elbow point (the point where the slope markedly changes)
elbow_point <- which.max(diff(diff(cum_proportions)))
abline(v = elbow_point, col = "red", lwd = 2, lty = 2)
# Highlight the elbow point
points(elbow_point, cum_proportions[elbow_point], col = "red", pch = 19, cex = 1.5)
return(invisible(elbow_point))
}
elbow_point <- scree_plot(selected_pca_results)
PCA insights: The principal components before the elbow (PC1 to PC4) are the most informative,(explain more than 90% of variance in SalePrice) as they capture the majority of the variance in the dataset.
fitdistr to fit an
exponential probability density function.fit_result <- fitdistr(train_data$GrLivArea, "exponential")
fit_result
## rate
## 6.598640e-04
## (1.726943e-05)
rexp(1000, λ)).lambda <- fit_result$estimate["rate"]
lambda
## rate
## 0.000659864
exp_samples <- rexp(1000, rate = lambda)
# Plotting histograms
par(mfrow = c(1, 2))
hist(train_data$GrLivArea,, main = "Original GrLivArea")
hist(exp_samples, main = "Exponential Sample")
par(mfrow = c(1, 1))
pi_5 <- qexp(0.05, rate = lambda)
pi_95 <- qexp(0.95, rate = lambda)
mean_samples <- mean(exp_samples)
std_error_samples <- sd(exp_samples) / sqrt(length(exp_samples))
# Generate a 95% confidence interval
CI <- c(mean_samples - 1.96 * std_error_samples, mean_samples + 1.96 * std_error_samples)
exp_pi_5 <- quantile(exp_samples, 0.05)
exp_pi_95 <- quantile(exp_samples, 0.95)
# Print results
cat("5th and 95th percentiles from fitted exponential:", pi_5, ",", pi_95, "\n")
## 5th and 95th percentiles from fitted exponential: 77.73313 , 4539.924
cat("95% Confidence Interval from the empirical data:", CI)
## 95% Confidence Interval from the empirical data: 1527.027 1725.238
cat("Empirical 5th & 95th percentile of the data:", exp_pi_5, ",", exp_pi_95)
## Empirical 5th & 95th percentile of the data: 79.17522 , 4744.321
The result confirms, the peak is near the minimum and tails off quickly, revealing that smaller spaces are far more common than larger ones.
# Rename columns to make syntactically valid names in R
names(train_data) <- make.names(names(train_data))
# Identify numeric and character features
numeric_features <- sapply(train_data, is.numeric)
character_features <- sapply(train_data, is.character)
# Fill missing values for numeric and character columns
train_data[numeric_features] <- lapply(train_data[numeric_features], function(x) {
if (any(is.na(x))) { x[is.na(x)] <- -999 }
return(x)
})
train_data[character_features] <- lapply(train_data[character_features], function(x) {
if (any(is.na(x))) { x[is.na(x)] <- "Others" }
return(x)
})
selected_vars <- train_data[, c("OverallQual", "GrLivArea", "GarageCars", "GarageArea", "TotalBsmtSF", "X1stFlrSF", "FullBath", "SalePrice")]
# Split the data into training and validation sets
set.seed(123)
train_index <- createDataPartition(selected_vars$SalePrice, p = 0.8, list = FALSE)
X_train <- selected_vars[train_index, -8]
y_train <- selected_vars[train_index, 8]
X_val <- selected_vars[-train_index, -8]
y_val <- selected_vars[-train_index, 8]
train_data_for_slr <- data.frame(GrLivArea = X_train$GrLivArea, SalePrice = y_train)
slr <- lm(SalePrice ~ GrLivArea, data = train_data_for_slr)
summary(slr)
##
## Call:
## lm(formula = SalePrice ~ GrLivArea, data = train_data_for_slr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -457901 -30333 -888 21700 280475
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19961.333 5034.287 3.965 7.78e-05 ***
## GrLivArea 105.980 3.167 33.460 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 54460 on 1167 degrees of freedom
## Multiple R-squared: 0.4896, Adjusted R-squared: 0.4892
## F-statistic: 1120 on 1 and 1167 DF, p-value: < 2.2e-16
slr_predictions <- predict(slr, newdata = data.frame(GrLivArea = X_val$GrLivArea))
slr_rmse <- rmse(y_val$SalePrice, slr_predictions)
print(paste("RMSE for Simple Linear Regression:", slr_rmse))
## [1] "RMSE for Simple Linear Regression: 62153.5420191601"
train_data_for_mlr <- data.frame(X_train, SalePrice = y_train)
mlr <- lm(SalePrice ~ OverallQual + GrLivArea + GarageCars + GarageArea + TotalBsmtSF + `X1stFlrSF` + FullBath, data = train_data_for_mlr)
summary(mlr)
##
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea + GarageCars +
## GarageArea + TotalBsmtSF + X1stFlrSF + FullBath, data = train_data_for_mlr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -442048 -20272 -1481 16672 227847
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.006e+05 5.233e+03 -19.233 < 2e-16 ***
## OverallQual 2.414e+04 1.159e+03 20.820 < 2e-16 ***
## GrLivArea 3.772e+01 3.151e+00 11.970 < 2e-16 ***
## GarageCars 1.412e+04 3.147e+03 4.486 7.98e-06 ***
## GarageArea 2.427e+01 1.079e+01 2.249 0.0247 *
## TotalBsmtSF 2.062e+01 4.460e+00 4.623 4.21e-06 ***
## X1stFlrSF 1.143e+01 5.265e+00 2.170 0.0302 *
## FullBath 3.112e+03 2.650e+03 1.174 0.2405
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36440 on 1161 degrees of freedom
## Multiple R-squared: 0.7727, Adjusted R-squared: 0.7713
## F-statistic: 563.7 on 7 and 1161 DF, p-value: < 2.2e-16
mlr_predictions <- predict(mlr, newdata = X_val)
mlr_rmse <- rmse(y_val$SalePrice, mlr_predictions)
print(paste("RMSE for Multiple Linear Regression:", mlr_rmse))
## [1] "RMSE for Multiple Linear Regression: 47634.810484816"
train_data[character_features] <- lapply(train_data[character_features], as.factor)
full_model <- lm(as.formula(paste("SalePrice ~", paste(setdiff(names(train_data), c("Id", "SalePrice")), collapse = " + "))), data = train_data)
# Extracting statistically significant variables (p-value <= 0.05)
significant_vars <- summary(full_model)$coefficients[summary(full_model)$coefficients[, "Pr(>|t|)"] <= 0.05, ]
significant_vars <- rownames(significant_vars)
available_vars <- names(train_data)[names(train_data) %in% significant_vars]
final_model_formula <- paste("SalePrice ~", paste(available_vars, collapse = " + "))
final_model <- lm(as.formula(final_model_formula), data = train_data)
summary(final_model)
##
## Call:
## lm(formula = as.formula(final_model_formula), data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -537081 -16309 -2255 12838 292293
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.550e+05 8.946e+04 -9.557 < 2e-16 ***
## LotArea 4.700e-01 1.033e-01 4.549 5.83e-06 ***
## OverallQual 1.880e+04 1.175e+03 15.998 < 2e-16 ***
## OverallCond 5.219e+03 9.535e+02 5.474 5.19e-08 ***
## YearBuilt 3.986e+02 4.537e+01 8.785 < 2e-16 ***
## MasVnrArea 1.881e+01 5.289e+00 3.556 0.000388 ***
## BsmtFinSF1 2.216e+01 4.306e+00 5.146 3.03e-07 ***
## BsmtFinSF2 8.968e+00 7.029e+00 1.276 0.202217
## BsmtUnfSF 9.614e+00 4.288e+00 2.242 0.025115 *
## X1stFlrSF 6.478e+01 5.133e+00 12.619 < 2e-16 ***
## X2ndFlrSF 5.464e+01 3.381e+00 16.160 < 2e-16 ***
## BedroomAbvGr -5.362e+03 1.484e+03 -3.614 0.000312 ***
## KitchenAbvGr -1.582e+04 4.724e+03 -3.348 0.000835 ***
## Fireplaces 3.585e+03 1.792e+03 2.001 0.045604 *
## GarageArea 3.423e+01 5.918e+00 5.783 8.97e-09 ***
## WoodDeckSF 2.719e+01 8.144e+00 3.339 0.000864 ***
## ScreenPorch 4.835e+01 1.751e+01 2.761 0.005835 **
## PoolArea -5.420e+01 2.407e+01 -2.252 0.024476 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35970 on 1442 degrees of freedom
## Multiple R-squared: 0.7974, Adjusted R-squared: 0.795
## F-statistic: 333.9 on 17 and 1442 DF, p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(final_model)
The Simple Linear Regression (SLR) utilizing ‘GrLivArea’ explains about 48.96% of the variance in ‘SalePrice’, with a notable RMSE of 62,153. Progressing to the Multiple Linear Regression (MLR), which incorporates variables like ‘OverallQual’, ‘GarageCars’, and ‘TotalBsmtSF’, enhances the R-squared to 77.27%, reducing the RMSE to 47,635. Finally, our comprehensive model, which filters only statistically significant predictors, achieves an R-squared of 79.74%.This final model, with its robust predictive capability, will be utilized for predictions on the test dataset.
# Load test data
test_data <- read_csv("test.csv")
## Rows: 1459 Columns: 80
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (43): MSZoning, Street, Alley, LotShape, LandContour, Utilities, LotConf...
## dbl (37): Id, MSSubClass, LotFrontage, LotArea, OverallQual, OverallCond, Ye...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Rename columns to make syntactically valid names in R
names(test_data) <- make.names(names(test_data))
# Identify numeric and character features in test_data
numeric_features_test <- sapply(test_data, is.numeric)
character_features_test <- sapply(test_data, is.character)
# Impute missing values for numeric and character columns
test_data[numeric_features_test] <- lapply(test_data[numeric_features_test], function(x) {
if (any(is.na(x))) { x[is.na(x)] <- -999 }
return(x)
})
test_data[character_features_test] <- lapply(test_data[character_features_test], function(x) {
if (any(is.na(x))) { x[is.na(x)] <- "Others" }
return(x)
})
# Convert character features to factors based on training data treatment
test_data[character_features_test] <- lapply(test_data[character_features_test], as.factor)
# Predict SalePrice using the final model
test_data$SalePrice <- predict(final_model, newdata = test_data)
# Create submission data frame
submission <- test_data[, c("Id", "SalePrice")]
# Write submission to a CSV file
write.csv(submission, "submission.csv", row.names = FALSE)