The House Prices data set from the Kaggle competition (https://www.kaggle.com/c/house-prices-advanced-regression-techniques) contains 81 variables in the training set and 80 in the test set (the test set has no SalePrice column), describing residential home sales in Ames, Iowa.
The house prices data set has some missing values and we are going to replace them with zero.
#load library
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(ggthemes)
library(corrplot)
## corrplot 0.92 loaded
library(rsample)
library(caret)
## Loading required package: lattice
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
# Arrange plots in a grid
library(gridExtra)
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
# Load the Kaggle training and test sets. Text columns are kept as character
# vectors (stringsAsFactors = FALSE); `FALSE` is spelled out because `F` is an
# ordinary variable that can be reassigned.
train <- read.csv("train.csv", stringsAsFactors = FALSE)
test <- read.csv("test.csv", stringsAsFactors = FALSE)
# Per-column summary of the training data as read from disk
summary(train)
## Id MSSubClass MSZoning LotFrontage
## Min. : 1.0 Min. : 20.0 Length:1460 Min. : 21.00
## 1st Qu.: 365.8 1st Qu.: 20.0 Class :character 1st Qu.: 59.00
## Median : 730.5 Median : 50.0 Mode :character Median : 69.00
## Mean : 730.5 Mean : 56.9 Mean : 70.05
## 3rd Qu.:1095.2 3rd Qu.: 70.0 3rd Qu.: 80.00
## Max. :1460.0 Max. :190.0 Max. :313.00
## NA's :259
## LotArea Street Alley LotShape
## Min. : 1300 Length:1460 Length:1460 Length:1460
## 1st Qu.: 7554 Class :character Class :character Class :character
## Median : 9478 Mode :character Mode :character Mode :character
## Mean : 10517
## 3rd Qu.: 11602
## Max. :215245
##
## LandContour Utilities LotConfig LandSlope
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Neighborhood Condition1 Condition2 BldgType
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## HouseStyle OverallQual OverallCond YearBuilt
## Length:1460 Min. : 1.000 Min. :1.000 Min. :1872
## Class :character 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954
## Mode :character Median : 6.000 Median :5.000 Median :1973
## Mean : 6.099 Mean :5.575 Mean :1971
## 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000
## Max. :10.000 Max. :9.000 Max. :2010
##
## YearRemodAdd RoofStyle RoofMatl Exterior1st
## Min. :1950 Length:1460 Length:1460 Length:1460
## 1st Qu.:1967 Class :character Class :character Class :character
## Median :1994 Mode :character Mode :character Mode :character
## Mean :1985
## 3rd Qu.:2004
## Max. :2010
##
## Exterior2nd MasVnrType MasVnrArea ExterQual
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 0.0 Mode :character
## Mean : 103.7
## 3rd Qu.: 166.0
## Max. :1600.0
## NA's :8
## ExterCond Foundation BsmtQual BsmtCond
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 383.5 Mode :character
## Mean : 443.6
## 3rd Qu.: 712.2
## Max. :5644.0
##
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Length:1460
## 1st Qu.: 0.00 1st Qu.: 223.0 1st Qu.: 795.8 Class :character
## Median : 0.00 Median : 477.5 Median : 991.5 Mode :character
## Mean : 46.55 Mean : 567.2 Mean :1057.4
## 3rd Qu.: 0.00 3rd Qu.: 808.0 3rd Qu.:1298.2
## Max. :1474.00 Max. :2336.0 Max. :6110.0
##
## HeatingQC CentralAir Electrical X1stFlrSF
## Length:1460 Length:1460 Length:1460 Min. : 334
## Class :character Class :character Class :character 1st Qu.: 882
## Mode :character Mode :character Mode :character Median :1087
## Mean :1163
## 3rd Qu.:1391
## Max. :4692
##
## X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath
## Min. : 0 Min. : 0.000 Min. : 334 Min. :0.0000
## 1st Qu.: 0 1st Qu.: 0.000 1st Qu.:1130 1st Qu.:0.0000
## Median : 0 Median : 0.000 Median :1464 Median :0.0000
## Mean : 347 Mean : 5.845 Mean :1515 Mean :0.4253
## 3rd Qu.: 728 3rd Qu.: 0.000 3rd Qu.:1777 3rd Qu.:1.0000
## Max. :2065 Max. :572.000 Max. :5642 Max. :3.0000
##
## BsmtHalfBath FullBath HalfBath BedroomAbvGr
## Min. :0.00000 Min. :0.000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000
## Median :0.00000 Median :2.000 Median :0.0000 Median :3.000
## Mean :0.05753 Mean :1.565 Mean :0.3829 Mean :2.866
## 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :2.00000 Max. :3.000 Max. :2.0000 Max. :8.000
##
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## Min. :0.000 Length:1460 Min. : 2.000 Length:1460
## 1st Qu.:1.000 Class :character 1st Qu.: 5.000 Class :character
## Median :1.000 Mode :character Median : 6.000 Mode :character
## Mean :1.047 Mean : 6.518
## 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :3.000 Max. :14.000
##
## Fireplaces FireplaceQu GarageType GarageYrBlt
## Min. :0.000 Length:1460 Length:1460 Min. :1900
## 1st Qu.:0.000 Class :character Class :character 1st Qu.:1961
## Median :1.000 Mode :character Mode :character Median :1980
## Mean :0.613 Mean :1979
## 3rd Qu.:1.000 3rd Qu.:2002
## Max. :3.000 Max. :2010
## NA's :81
## GarageFinish GarageCars GarageArea GarageQual
## Length:1460 Min. :0.000 Min. : 0.0 Length:1460
## Class :character 1st Qu.:1.000 1st Qu.: 334.5 Class :character
## Mode :character Median :2.000 Median : 480.0 Mode :character
## Mean :1.767 Mean : 473.0
## 3rd Qu.:2.000 3rd Qu.: 576.0
## Max. :4.000 Max. :1418.0
##
## GarageCond PavedDrive WoodDeckSF OpenPorchSF
## Length:1460 Length:1460 Min. : 0.00 Min. : 0.00
## Class :character Class :character 1st Qu.: 0.00 1st Qu.: 0.00
## Mode :character Mode :character Median : 0.00 Median : 25.00
## Mean : 94.24 Mean : 46.66
## 3rd Qu.:168.00 3rd Qu.: 68.00
## Max. :857.00 Max. :547.00
##
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 0.00 Median : 0.00 Median : 0.00 Median : 0.000
## Mean : 21.95 Mean : 3.41 Mean : 15.06 Mean : 2.759
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :552.00 Max. :508.00 Max. :480.00 Max. :738.000
##
## PoolQC Fence MiscFeature MiscVal
## Length:1460 Length:1460 Length:1460 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 0.00
## Mode :character Mode :character Mode :character Median : 0.00
## Mean : 43.49
## 3rd Qu.: 0.00
## Max. :15500.00
##
## MoSold YrSold SaleType SaleCondition
## Min. : 1.000 Min. :2006 Length:1460 Length:1460
## 1st Qu.: 5.000 1st Qu.:2007 Class :character Class :character
## Median : 6.000 Median :2008 Mode :character Mode :character
## Mean : 6.322 Mean :2008
## 3rd Qu.: 8.000 3rd Qu.:2009
## Max. :12.000 Max. :2010
##
## SalePrice
## Min. : 34900
## 1st Qu.:129975
## Median :163000
## Mean :180921
## 3rd Qu.:214000
## Max. :755000
##
# Total number of missing cells in the training set
train %>% is.na() %>% sum()
## [1] 6965
# Number of missing values in each training column
train %>% is.na() %>% colSums()
## Id MSSubClass MSZoning LotFrontage LotArea
## 0 0 0 259 0
## Street Alley LotShape LandContour Utilities
## 0 1369 0 0 0
## LotConfig LandSlope Neighborhood Condition1 Condition2
## 0 0 0 0 0
## BldgType HouseStyle OverallQual OverallCond YearBuilt
## 0 0 0 0 0
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd
## 0 0 0 0 0
## MasVnrType MasVnrArea ExterQual ExterCond Foundation
## 8 8 0 0 0
## BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1
## 37 37 38 37 0
## BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## 38 0 0 0 0
## HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF
## 0 0 1 0 0
## LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## 0 0 0 0 0
## HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd
## 0 0 0 0 0
## Functional Fireplaces FireplaceQu GarageType GarageYrBlt
## 0 0 690 81 81
## GarageFinish GarageCars GarageArea GarageQual GarageCond
## 81 0 0 81 81
## PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 0 0 0 0 0
## ScreenPorch PoolArea PoolQC Fence MiscFeature
## 0 0 1453 1179 1406
## MiscVal MoSold YrSold SaleType SaleCondition
## 0 0 0 0 0
## SalePrice
## 0
# Number of missing values in each test column
test %>% is.na() %>% colSums()
## Id MSSubClass MSZoning LotFrontage LotArea
## 0 0 4 227 0
## Street Alley LotShape LandContour Utilities
## 0 1352 0 0 2
## LotConfig LandSlope Neighborhood Condition1 Condition2
## 0 0 0 0 0
## BldgType HouseStyle OverallQual OverallCond YearBuilt
## 0 0 0 0 0
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd
## 0 0 0 1 1
## MasVnrType MasVnrArea ExterQual ExterCond Foundation
## 16 15 0 0 0
## BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1
## 44 45 44 42 1
## BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## 42 1 1 1 0
## HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF
## 0 0 0 0 0
## LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## 0 0 2 2 0
## HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd
## 0 0 0 1 0
## Functional Fireplaces FireplaceQu GarageType GarageYrBlt
## 2 0 730 76 78
## GarageFinish GarageCars GarageArea GarageQual GarageCond
## 78 1 1 78 78
## PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 0 0 0 0 0
## ScreenPorch PoolArea PoolQC Fence MiscFeature
## 0 0 1456 1169 1408
## MiscVal MoSold YrSold SaleType SaleCondition
## 0 0 0 1 0
# Replace every missing cell with 0 in both data sets.
# Note: missing entries in character columns become the string "0".
train <- replace(train, is.na(train), 0)
test <- replace(test, is.na(test), 0)
# Dimensions (rows, columns) after the fill
dim(train)
## [1] 1460 81
dim(test)
## [1] 1459 80
# X: above-ground living area (predictor); Y: sale price (response)
X <- train[["GrLivArea"]]
Y <- train[["SalePrice"]]
# Attach ggplot2 for the plots below
library(ggplot2)
# Histogram of the independent variable X (GrLivArea), bin width 50
ggplot() +
  geom_histogram(
    mapping = aes(x = X),
    binwidth = 50,
    fill = "lightblue",
    color = "black"
  ) +
  ggtitle("Histogram of GrLivArea (Independent Variable)") +
  xlab("GrLivArea") +
  ylab("Frequency") +
  theme_minimal()
# Kernel density estimate of the dependent variable Y (SalePrice)
ggplot() +
  geom_density(
    mapping = aes(x = Y),
    fill = "lightgreen",
    color = "black"
  ) +
  ggtitle("Density Plot of SalePrice (Dependent Variable)") +
  xlab("SalePrice") +
  ylab("Density") +
  theme_minimal()
Let's calculate, as a minimum, the probabilities a through c below. Assume the small letter “x” is estimated as the 3rd quartile of the X variable, and the small letter “y” is estimated as the 2nd quartile of the Y variable. Interpret the meaning of all probabilities. In addition, make a table of counts as shown below. a. $P(X>x \mid Y>y)$ b. $P(X>x, Y>y)$ c. $P(X<x \mid Y>y)$
# Build the 3x3 table of counts for the exercise.
#   rows    : X (GrLivArea) split at x = 3rd quartile of X
#   columns : Y (SalePrice) split at y = 2nd quartile (median) of Y
# The previous version split X at its median inside the cells and put X
# counts in the bottom "Total" row, so the margins did not match the cells.

# Thresholds. The median of X is not needed for the table but is kept because
# later chunks still reference `x_quartile2`.
x_quartile2 <- quantile(X, probs = 0.50, na.rm = TRUE)
y_quartile2 <- quantile(Y, probs = 0.50, na.rm = TRUE)
x_quartile3 <- quantile(X, probs = 0.75, na.rm = TRUE)

# Marginal counts of X around its median and around its 3rd quartile
x_leq_2d_quartile <- sum(X <= x_quartile2)
x_gt_2d_quartile <- sum(X > x_quartile2)
x_leq_3d_quartile <- sum(X <= x_quartile3)
x_gt_3d_quartile <- sum(X > x_quartile3)

# Joint counts. Variable names are kept from the earlier version for
# compatibility: the first part of each name is the X side (now split at the
# 3rd quartile of X) and the second part is the Y side (split at the median).
leq_2d_quartile_leq_3d_quartile <- sum(X <= x_quartile3 & Y <= y_quartile2)
leq_2d_quartile_gt_3d_quartile <- sum(X <= x_quartile3 & Y > y_quartile2)
gt_2d_quartile_leq_3d_quartile <- sum(X > x_quartile3 & Y <= y_quartile2)
gt_2d_quartile_gt_3d_quartile <- sum(X > x_quartile3 & Y > y_quartile2)

# Row totals (X side) and column totals (Y side)
total_leq_3d_quartile <- x_leq_3d_quartile
total_gt_3d_quartile <- x_gt_3d_quartile
total_leq_2d_quartile <- sum(Y <= y_quartile2)
total_gt_2d_quartile <- sum(Y > y_quartile2)
total <- total_leq_3d_quartile + total_gt_3d_quartile

# Fill the table row by row; the last column holds row totals and the last
# row holds column totals, so every margin equals the sum of its cells.
table_counts <- matrix(
  c(
    leq_2d_quartile_leq_3d_quartile, leq_2d_quartile_gt_3d_quartile, total_leq_3d_quartile,
    gt_2d_quartile_leq_3d_quartile, gt_2d_quartile_gt_3d_quartile, total_gt_3d_quartile,
    total_leq_2d_quartile, total_gt_2d_quartile, total
  ),
  nrow = 3, byrow = TRUE
)
# Column labels refer to Y, row labels refer to X
colnames(table_counts) <- c("<=2d quartile", ">2d quartile", "Total")
rownames(table_counts) <- c("<=3d quartile", ">3d quartile", "Total")
# Print the table.
# NOTE(review): any previously rendered output of this table predates the fix;
# re-run the document to refresh it.
print(table_counts)
## <=2d quartile >2d quartile Total
## <=3d quartile 577 154 731
## >3d quartile 155 574 729
## Total 1095 365 1460
Definition GrLivArea: Above Ground Living Area, measured in square feet. It encompasses the living area that is not in the basement.
Properties whose above-ground living area is at or below the 3rd quartile and whose sale price is at or below the 2nd quartile (577 houses) are likely houses without a finished living basement. Those houses tend to be located in more affordable areas.
Properties whose above-ground living area is at or below the 3rd quartile and whose sale price is above the 2nd quartile (154 houses) are most likely located in low- or average-priced areas, based on the detailed information available in the data set.
Properties whose above-ground living area is above the 3rd quartile and whose sale price is at or below the 2nd quartile (155 houses) are most likely located in up-and-coming neighborhoods where sale prices are increasing.
Properties whose above-ground living area is above the 3rd quartile and whose sale price is above the 2nd quartile (574 houses) are most likely in the most expensive areas to live in, because they offer more living space; basement space is not counted toward the above-ground measurement.
# Probabilities a, b and c, computed directly from X and Y with
# x = 3rd quartile of X (`x_quartile3`) and y = median of Y (`y_quartile2`).
# The previous version divided by the count of X > x instead of the count of
# Y > y, which produced a "probability" above 1 for (a), and it labelled the
# joint probability (b) as a conditional one.
n_y_gt_y <- sum(Y > y_quartile2)
# a) share of houses priced above the median that are also larger than x
prob_a <- sum(X > x_quartile3 & Y > y_quartile2) / n_y_gt_y
# b) share of all houses that are both larger than x and priced above y
prob_b <- sum(X > x_quartile3 & Y > y_quartile2) / length(X)
# c) share of houses priced above the median that are smaller than x
prob_c <- sum(X < x_quartile3 & Y > y_quartile2) / n_y_gt_y
cat('a) P(X>x|Y>y) is ', prob_a, '\n')
cat('b) P(X>x,Y>y) is ', prob_b, '\n')
cat('c) P(X<x|Y>y) is ', prob_c, '\n')
# NOTE(review): the previously rendered values (1.572603, 0.3931507,
# 0.4219178) came from the incorrect formulas; re-run to regenerate the output.
# Independence check for A = {X > 3rd quartile of X} and B = {Y > median of Y}.
# Probabilities are computed from the data rather than from `table_counts`
# indices: the previous indices divided a row total by a column total, which
# gave P(A|B) = 1.99726 (impossible for a probability).
event_A <- X > x_quartile3
event_B <- Y > y_quartile2
P_A <- mean(event_A)
P_B <- mean(event_B)
P_A_and_B <- mean(event_A & event_B)
P_A_given_B <- P_A_and_B / P_B
P_A_times_P_B <- P_A * P_B
# Literal question from the exercise: does P(A|B) equal P(A)P(B)?
# Compared with a tolerance instead of `==` on floating-point values.
equals_product <- isTRUE(all.equal(P_A_given_B, P_A_times_P_B, tolerance = 1e-6))
# Independence itself means P(A and B) = P(A)P(B), equivalently P(A|B) = P(A)
is_independent <- isTRUE(all.equal(P_A_and_B, P_A_times_P_B, tolerance = 1e-6))
# Print probabilities and whether variables A and B are independent
cat("P(A|B) =", P_A_given_B, "\n")
cat("P(A) =", P_A, "\n")
cat("P(B) =", P_B, "\n")
cat("P(A and B) =", P_A_and_B, "\n")
cat("Is P(A|B) equal to P(A)P(B)?", equals_product, "\n")
cat("Is P(A and B) equal to P(A)P(B)?", is_independent, "\n")
# NOTE(review): previously rendered values for this chunk came from the
# incorrect table indices; re-run to regenerate the output.
# Chi-square test of association between
#   A = {GrLivArea > x}, x = 3rd quartile of X, and
#   B = {SalePrice > y}, y = 2nd quartile (median) of Y.
# The previous version split GrLivArea at its median (`x_quartile2`), which
# does not match the definition of x used for the probabilities.
# NOTE(review): the recorded test output was produced with the median split;
# re-run to refresh the statistic.
condition_X_greater_x <- train$GrLivArea > x_quartile3
condition_Y_greater_y <- train$SalePrice > y_quartile2
table_A_B <- table(condition_X_greater_x, condition_Y_greater_y)
chisq.test(table_A_B)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table_A_B
## X-squared = 483.29, df = 1, p-value < 2.2e-16
The Chi-Square test shows that there is a strong association between GrLivArea and SalePrice. The test yields a statistic of 483.29 (df = 1) with a p-value of approximately 4.118547e-107, which R reports as p-value < 2.2e-16. Because the p-value is far below any conventional significance level, we reject the null hypothesis that the two variables are independent.
# Per-column summary of the training set after the NA fill
train %>% summary()
## Id MSSubClass MSZoning LotFrontage
## Min. : 1.0 Min. : 20.0 Length:1460 Min. : 0.00
## 1st Qu.: 365.8 1st Qu.: 20.0 Class :character 1st Qu.: 42.00
## Median : 730.5 Median : 50.0 Mode :character Median : 63.00
## Mean : 730.5 Mean : 56.9 Mean : 57.62
## 3rd Qu.:1095.2 3rd Qu.: 70.0 3rd Qu.: 79.00
## Max. :1460.0 Max. :190.0 Max. :313.00
## LotArea Street Alley LotShape
## Min. : 1300 Length:1460 Length:1460 Length:1460
## 1st Qu.: 7554 Class :character Class :character Class :character
## Median : 9478 Mode :character Mode :character Mode :character
## Mean : 10517
## 3rd Qu.: 11602
## Max. :215245
## LandContour Utilities LotConfig LandSlope
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Neighborhood Condition1 Condition2 BldgType
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## HouseStyle OverallQual OverallCond YearBuilt
## Length:1460 Min. : 1.000 Min. :1.000 Min. :1872
## Class :character 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954
## Mode :character Median : 6.000 Median :5.000 Median :1973
## Mean : 6.099 Mean :5.575 Mean :1971
## 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000
## Max. :10.000 Max. :9.000 Max. :2010
## YearRemodAdd RoofStyle RoofMatl Exterior1st
## Min. :1950 Length:1460 Length:1460 Length:1460
## 1st Qu.:1967 Class :character Class :character Class :character
## Median :1994 Mode :character Mode :character Mode :character
## Mean :1985
## 3rd Qu.:2004
## Max. :2010
## Exterior2nd MasVnrType MasVnrArea ExterQual
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 0.0 Mode :character
## Mean : 103.1
## 3rd Qu.: 164.2
## Max. :1600.0
## ExterCond Foundation BsmtQual BsmtCond
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 383.5 Mode :character
## Mean : 443.6
## 3rd Qu.: 712.2
## Max. :5644.0
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Length:1460
## 1st Qu.: 0.00 1st Qu.: 223.0 1st Qu.: 795.8 Class :character
## Median : 0.00 Median : 477.5 Median : 991.5 Mode :character
## Mean : 46.55 Mean : 567.2 Mean :1057.4
## 3rd Qu.: 0.00 3rd Qu.: 808.0 3rd Qu.:1298.2
## Max. :1474.00 Max. :2336.0 Max. :6110.0
## HeatingQC CentralAir Electrical X1stFlrSF
## Length:1460 Length:1460 Length:1460 Min. : 334
## Class :character Class :character Class :character 1st Qu.: 882
## Mode :character Mode :character Mode :character Median :1087
## Mean :1163
## 3rd Qu.:1391
## Max. :4692
## X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath
## Min. : 0 Min. : 0.000 Min. : 334 Min. :0.0000
## 1st Qu.: 0 1st Qu.: 0.000 1st Qu.:1130 1st Qu.:0.0000
## Median : 0 Median : 0.000 Median :1464 Median :0.0000
## Mean : 347 Mean : 5.845 Mean :1515 Mean :0.4253
## 3rd Qu.: 728 3rd Qu.: 0.000 3rd Qu.:1777 3rd Qu.:1.0000
## Max. :2065 Max. :572.000 Max. :5642 Max. :3.0000
## BsmtHalfBath FullBath HalfBath BedroomAbvGr
## Min. :0.00000 Min. :0.000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000
## Median :0.00000 Median :2.000 Median :0.0000 Median :3.000
## Mean :0.05753 Mean :1.565 Mean :0.3829 Mean :2.866
## 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :2.00000 Max. :3.000 Max. :2.0000 Max. :8.000
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## Min. :0.000 Length:1460 Min. : 2.000 Length:1460
## 1st Qu.:1.000 Class :character 1st Qu.: 5.000 Class :character
## Median :1.000 Mode :character Median : 6.000 Mode :character
## Mean :1.047 Mean : 6.518
## 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :3.000 Max. :14.000
## Fireplaces FireplaceQu GarageType GarageYrBlt
## Min. :0.000 Length:1460 Length:1460 Min. : 0
## 1st Qu.:0.000 Class :character Class :character 1st Qu.:1958
## Median :1.000 Mode :character Mode :character Median :1977
## Mean :0.613 Mean :1869
## 3rd Qu.:1.000 3rd Qu.:2001
## Max. :3.000 Max. :2010
## GarageFinish GarageCars GarageArea GarageQual
## Length:1460 Min. :0.000 Min. : 0.0 Length:1460
## Class :character 1st Qu.:1.000 1st Qu.: 334.5 Class :character
## Mode :character Median :2.000 Median : 480.0 Mode :character
## Mean :1.767 Mean : 473.0
## 3rd Qu.:2.000 3rd Qu.: 576.0
## Max. :4.000 Max. :1418.0
## GarageCond PavedDrive WoodDeckSF OpenPorchSF
## Length:1460 Length:1460 Min. : 0.00 Min. : 0.00
## Class :character Class :character 1st Qu.: 0.00 1st Qu.: 0.00
## Mode :character Mode :character Median : 0.00 Median : 25.00
## Mean : 94.24 Mean : 46.66
## 3rd Qu.:168.00 3rd Qu.: 68.00
## Max. :857.00 Max. :547.00
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 0.00 Median : 0.00 Median : 0.00 Median : 0.000
## Mean : 21.95 Mean : 3.41 Mean : 15.06 Mean : 2.759
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :552.00 Max. :508.00 Max. :480.00 Max. :738.000
## PoolQC Fence MiscFeature MiscVal
## Length:1460 Length:1460 Length:1460 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 0.00
## Mode :character Mode :character Mode :character Median : 0.00
## Mean : 43.49
## 3rd Qu.: 0.00
## Max. :15500.00
## MoSold YrSold SaleType SaleCondition
## Min. : 1.000 Min. :2006 Length:1460 Length:1460
## 1st Qu.: 5.000 1st Qu.:2007 Class :character Class :character
## Median : 6.000 Median :2008 Mode :character Mode :character
## Mean : 6.322 Mean :2008
## 3rd Qu.: 8.000 3rd Qu.:2009
## Max. :12.000 Max. :2010
## SalePrice
## Min. : 34900
## 1st Qu.:129975
## Median :163000
## Mean :180921
## 3rd Qu.:214000
## Max. :755000
# Six-number summaries of X (GrLivArea) and Y (SalePrice)
X %>% summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 334 1130 1464 1515 1777 5642
Y %>% summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 34900 129975 163000 180921 214000 755000
# Univariate and bivariate views of GrLivArea (X) and SalePrice (Y).
# Aesthetics are mapped to columns of `train`, the data frame handed to
# ggplot(), instead of the free-standing vectors X and Y, so the plots no
# longer depend on global variables staying in sync with `train`.

# Histogram of GrLivArea, bin width 50
histogram_x <- ggplot(train, aes(x = GrLivArea)) +
  geom_histogram(binwidth = 50, fill = "lightblue", color = "black") +
  labs(title = "Histogram of X (GrLivArea)", x = "GrLivArea", y = "Frequency") +
  theme_minimal()
# Histogram of SalePrice, bin width 50000
histogram_y <- ggplot(train, aes(x = SalePrice)) +
  geom_histogram(binwidth = 50000, fill = "lightgreen", color = "black") +
  labs(title = "Histogram of Y (SalePrice)", x = "SalePrice", y = "Frequency") +
  theme_minimal()
# Density estimate of GrLivArea
density_x <- ggplot(train, aes(x = GrLivArea)) +
  geom_density(fill = "lightblue", color = "black") +
  labs(title = "Density Plot of X (GrLivArea)", x = "GrLivArea", y = "Density") +
  theme_minimal()
# Density estimate of SalePrice
density_y <- ggplot(train, aes(x = SalePrice)) +
  geom_density(fill = "lightgreen", color = "black") +
  labs(title = "Density Plot of Y (SalePrice)", x = "SalePrice", y = "Density") +
  theme_minimal()
# Scatter plot of SalePrice against GrLivArea; alpha reduces over-plotting
scatter_plot <- ggplot(train, aes(x = GrLivArea, y = SalePrice)) +
  geom_point(alpha = 0.5, color = "blue") +
  labs(title = "Scatter Plot of X vs Y", x = "GrLivArea", y = "SalePrice") +
  theme_minimal()
# Lay the five plots out on a grid with three rows
gridExtra::grid.arrange(histogram_x, histogram_y, density_x, density_y, scatter_plot, nrow = 3)
# Two-column frame shared by the correlation and PCA steps below
area_price <- train[, c("GrLivArea", "SalePrice")]

# ---- 95% confidence interval for the difference in means (X minus Y) ----
n_X <- sum(!is.na(train$GrLivArea))
n_Y <- sum(!is.na(train$SalePrice))
mean_X <- mean(train$GrLivArea, na.rm = TRUE)
mean_Y <- mean(train$SalePrice, na.rm = TRUE)
sd_X <- sd(train$GrLivArea, na.rm = TRUE)
sd_Y <- sd(train$SalePrice, na.rm = TRUE)
diff_mean <- mean_X - mean_Y
# Standard error built from the two separate sample variances
se_diff_mean <- sqrt(sd_X^2 / n_X + sd_Y^2 / n_Y)
# NOTE(review): unpooled standard error combined with pooled degrees of
# freedom (n_X + n_Y - 2) -- confirm this is the intended interval.
margin_error <- se_diff_mean * qt(0.975, df = n_X + n_Y - 2)
CI_diff_mean <- diff_mean + c(-1, 1) * margin_error

# ---- Correlation matrix and its inverse (precision matrix) ----
correlation_matrix <- cor(area_price, use = "complete.obs")
precision_matrix <- solve(correlation_matrix)
# Products of the correlation matrix and its inverse, taken in both orders
result1 <- correlation_matrix %*% precision_matrix
result2 <- precision_matrix %*% correlation_matrix

# ---- Principal components analysis on the scaled variables ----
pca_result <- prcomp(area_price, scale. = TRUE)
# Standard deviation and proportion of variance for each component
summary(pca_result)
## Importance of components:
## PC1 PC2
## Standard deviation 1.3071 0.5398
## Proportion of Variance 0.8543 0.1457
## Cumulative Proportion 0.8543 1.0000
# 99% confidence interval for the correlation via the Fisher z-transform
r_xy <- correlation_matrix[1, 2]
Z <- 0.5 * log((1 + r_xy) / (1 - r_xy))
SE_Z <- 1 / sqrt(n_X - 3)
# qnorm(0.995) is the two-sided critical value for 99% coverage
margin_error_Z <- SE_Z * qnorm(0.995)
# Interval on the z scale, mapped back to the correlation scale with tanh
CI_correlation <- tanh(Z + c(-1, 1) * margin_error_Z)
# Report the interval for the mean difference, the correlation matrix, and
# the interval for the correlation
cat("95% Confidence Interval for the Difference in Mean:", CI_diff_mean, "\n")
## 95% Confidence Interval for the Difference in Mean: -183482.5 -175329
cat("Correlation Matrix for GrLivArea and SalePrice:", "\n")
## Correlation Matrix for GrLivArea and SalePrice:
print(correlation_matrix)
## GrLivArea SalePrice
## GrLivArea 1.0000000 0.7086245
## SalePrice 0.7086245 1.0000000
cat("Hypothesis Test for Correlation (GrLivArea and SalePrice):", "\n")
## Hypothesis Test for Correlation (GrLivArea and SalePrice):
cat("H0: Correlation = 0, HA: Correlation ≠ 0", "\n")
## H0: Correlation = 0, HA: Correlation ≠ 0
cat("99% Confidence Interval for Correlation:", CI_correlation, "\n")
## 99% Confidence Interval for Correlation: 0.6733974 0.7406408
The univariate plots show how X and Y are distributed and reveal the presence of outliers. The scatter plot visualizes the relationship between X and Y. The correlation matrix shows a strong positive relationship between the two variables, with a correlation of about 0.71.
The hypothesis test shows that there is a strong relationship between the variables. We are 99% confident that the true correlation coefficient lies within the interval from 0.673 to 0.741; because this interval excludes zero, we reject the null hypothesis of no correlation.
# Scree plot: proportion of total variance explained by each principal
# component (squared standard deviations normalised to sum to 1).
pc_variance <- pca_result$sdev^2
scree_data <- data.frame(
  PC = seq_along(pc_variance), # seq_along() is safe even for length 0
  Variance = pc_variance / sum(pc_variance)
)
scree_plot <- ggplot(scree_data, aes(x = PC, y = Variance)) +
  geom_bar(stat = "identity", fill = "skyblue", width = 0.5) +
  labs(title = "Scree Plot", x = "Principal Component", y = "Proportion of Variance Explained") +
  theme_minimal()
# Biplot of the observations in principal-component space.
# `labels` in ggbiplot labels the observations, so the two variable names
# passed previously were recycled over all rows; it is dropped here.
# `var.axes = TRUE` draws the variable arrows, which carry the variable names
# and are what the accompanying text interprets.
biplot <- ggbiplot::ggbiplot(pca_result,
                             ellipse = TRUE, circle = TRUE, var.axes = TRUE) +
  ggtitle("Biplot of PCA") +
  theme_minimal()
# Arrange plots in a grid
library(gridExtra)
grid.arrange(scree_plot, biplot, nrow = 1)
The scree plot shows the proportion of variance (the normalized eigenvalues) explained by each component of the Principal Component Analysis. The biplot shows the relationships between the original variables (X and Y) and the principal components. The direction and length of the arrows represent the direction and strength of the relationship between the variables and the principal components.
For Principal Component 1 (PC1):
PC1 has a standard deviation of 1.3071, a proportion of variance of 0.8543, and a cumulative proportion of 0.8543. PC1 has a high standard deviation, which means that it captures a significant amount of the variability in the data set. PC1 could represent the primary factor influencing the observations in the data set. PC2 captures the additional, less dominant patterns or trends in the data.
# Shift GrLivArea so that its minimum value becomes 1
shifted_variable <- train$GrLivArea - min(train$GrLivArea) + 1
# Fit an exponential distribution to the shifted values with MASS::fitdistr
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
fit_exp <- fitdistr(shifted_variable, densfun = "exponential")
# Estimated rate parameter (lambda) of the fitted exponential
lambda <- fit_exp[["estimate"]]
# Draw 1000 values from the fitted exponential distribution
samples <- rexp(n = 1000, rate = lambda)
# Side-by-side histograms: observed (shifted) data vs simulated draws
par(mfrow = c(1, 2))
hist(shifted_variable, main = "Original Variable", xlab = "Value", col = "skyblue", border = "white")
hist(samples, main = "Exponential Distribution", xlab = "Value", col = "lightgreen", border = "white")
# 5th and 95th percentiles of the fitted exponential distribution
percentile_5 <- qexp(p = 0.05, rate = lambda)
percentile_95 <- qexp(p = 0.95, rate = lambda)
# Normal-theory 95% confidence interval for the mean of the shifted data
mean_var <- mean(shifted_variable)
sd_var <- sd(shifted_variable)
half_width <- 1.96 * (sd_var / sqrt(length(shifted_variable)))
CI <- c(mean_var - half_width, mean_var + half_width)
# Empirical 5th and 95th percentiles of the shifted data
empirical_percentile_5 <- quantile(shifted_variable, probs = 0.05)
empirical_percentile_95 <- quantile(shifted_variable, probs = 0.95)
# Collect the empirical percentiles in a small data frame and display it
empirical_percentiles_df <- data.frame(
  Percentile = c("5th", "95th"),
  Value = c(empirical_percentile_5, empirical_percentile_95)
)
print(empirical_percentiles_df)
## Percentile Value
## 5% 5th 515.0
## 95% 95th 2133.1
The simulated exponential data is much more skewed than the observed data, so the exponential distribution is not a great fit. We need to use better techniques to address the skewness.
We use correlation to pick the most relevant features for predicting the Sale Price of a house. For every one-unit increase in the overall quality rating of the house (a rating of the overall material and finish of the house), the Sale Price is estimated to increase by \$18,566.79, holding all other variables constant. For each additional car of garage capacity (and the storage that comes with it), the Sale Price is estimated to increase by \$17,578.73. The model did fairly well: it achieved an R-squared value of 0.8128, which suggests that approximately 81.3% of the variability in Sale Price is explained by the predictor variables included in the model. We see that the overall quality of the house, the year the house was built, and garage space have significant impacts on the Sale Price of the house.
We discovered some outliers when plotting the fitted values against the residuals. We see individual points that fall far from the main cluster of points, which means that these outliers could represent observations that are poorly explained by the model, or data points with unusual characteristics that require further investigation.
We were able to achieve a score of 0.67155 on Kaggle, so the model still has to be improved. One could use other evaluation techniques, or use Principal Component Analysis (PCA) to reduce the dimensionality of the data while preserving as much variance as possible; this technique may also help lessen the influence of these outliers.
# modeling
# Scatter plot of sale price (x) against above-ground living area (y)
library(dplyr)
ggplot(data = train, mapping = aes(x = SalePrice, y = GrLivArea)) +
  geom_point()
# preprocessing data
set.seed(123)
# Convert every character column to a factor; all other columns are left
# untouched. across(where(...)) replaces the superseded mutate_if().
training <- train %>% mutate(across(where(is.character), as.factor))
testing <- test %>% mutate(across(where(is.character), as.factor))
# Print the dimensions (rows, columns) of both data sets
cat("training set has", dim(training), "testing set has ", dim(testing))
## training set has 1460 81 testing set has 1459 80
# Compact column-by-column overview of the converted training data
glimpse(training)
## Rows: 1,460
## Columns: 81
## $ Id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
## $ MSSubClass <int> 60, 20, 60, 70, 60, 50, 20, 60, 50, 190, 20, 60, 20, 20,…
## $ MSZoning <fct> RL, RL, RL, RL, RL, RL, RL, RL, RM, RL, RL, RL, RL, RL, …
## $ LotFrontage <dbl> 65, 80, 68, 60, 84, 85, 75, 0, 51, 50, 70, 85, 0, 91, 0,…
## $ LotArea <int> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 10382, 612…
## $ Street <fct> Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pa…
## $ Alley <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ LotShape <fct> Reg, Reg, IR1, IR1, IR1, IR1, Reg, IR1, Reg, Reg, Reg, I…
## $ LandContour <fct> Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, L…
## $ Utilities <fct> AllPub, AllPub, AllPub, AllPub, AllPub, AllPub, AllPub, …
## $ LotConfig <fct> Inside, FR2, Inside, Corner, FR2, Inside, Inside, Corner…
## $ LandSlope <fct> Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, G…
## $ Neighborhood <fct> CollgCr, Veenker, CollgCr, Crawfor, NoRidge, Mitchel, So…
## $ Condition1 <fct> Norm, Feedr, Norm, Norm, Norm, Norm, Norm, PosN, Artery,…
## $ Condition2 <fct> Norm, Norm, Norm, Norm, Norm, Norm, Norm, Norm, Norm, Ar…
## $ BldgType <fct> 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 2f…
## $ HouseStyle <fct> 2Story, 1Story, 2Story, 2Story, 2Story, 1.5Fin, 1Story, …
## $ OverallQual <int> 7, 6, 7, 7, 8, 5, 8, 7, 7, 5, 5, 9, 5, 7, 6, 7, 6, 4, 5,…
## $ OverallCond <int> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 5, 5, 8, 7, 5, 5,…
## $ YearBuilt <int> 2003, 1976, 2001, 1915, 2000, 1993, 2004, 1973, 1931, 19…
## $ YearRemodAdd <int> 2003, 1976, 2002, 1970, 2000, 1995, 2005, 1973, 1950, 19…
## $ RoofStyle <fct> Gable, Gable, Gable, Gable, Gable, Gable, Gable, Gable, …
## $ RoofMatl <fct> CompShg, CompShg, CompShg, CompShg, CompShg, CompShg, Co…
## $ Exterior1st <fct> VinylSd, MetalSd, VinylSd, Wd Sdng, VinylSd, VinylSd, Vi…
## $ Exterior2nd <fct> VinylSd, MetalSd, VinylSd, Wd Shng, VinylSd, VinylSd, Vi…
## $ MasVnrType <fct> BrkFace, None, BrkFace, None, BrkFace, None, Stone, Ston…
## $ MasVnrArea <dbl> 196, 0, 162, 0, 350, 0, 186, 240, 0, 0, 0, 286, 0, 306, …
## $ ExterQual <fct> Gd, TA, Gd, TA, Gd, TA, Gd, TA, TA, TA, TA, Ex, TA, Gd, …
## $ ExterCond <fct> TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, …
## $ Foundation <fct> PConc, CBlock, PConc, BrkTil, PConc, Wood, PConc, CBlock…
## $ BsmtQual <fct> Gd, Gd, Gd, TA, Gd, Gd, Ex, Gd, TA, TA, TA, Ex, TA, Gd, …
## $ BsmtCond <fct> TA, TA, TA, Gd, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, …
## $ BsmtExposure <fct> No, Gd, Mn, No, Av, No, Av, Mn, No, No, No, No, No, Av, …
## $ BsmtFinType1 <fct> GLQ, ALQ, GLQ, ALQ, GLQ, GLQ, GLQ, ALQ, Unf, GLQ, Rec, G…
## $ BsmtFinSF1 <int> 706, 978, 486, 216, 655, 732, 1369, 859, 0, 851, 906, 99…
## $ BsmtFinType2 <fct> Unf, Unf, Unf, Unf, Unf, Unf, Unf, BLQ, Unf, Unf, Unf, U…
## $ BsmtFinSF2 <int> 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ BsmtUnfSF <int> 150, 284, 434, 540, 490, 64, 317, 216, 952, 140, 134, 17…
## $ TotalBsmtSF <int> 856, 1262, 920, 756, 1145, 796, 1686, 1107, 952, 991, 10…
## $ Heating <fct> GasA, GasA, GasA, GasA, GasA, GasA, GasA, GasA, GasA, Ga…
## $ HeatingQC <fct> Ex, Ex, Ex, Gd, Ex, Ex, Ex, Ex, Gd, Ex, Ex, Ex, TA, Ex, …
## $ CentralAir <fct> Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y,…
## $ Electrical <fct> SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, …
## $ X1stFlrSF <int> 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1022, 1077, …
## $ X2ndFlrSF <int> 854, 0, 866, 756, 1053, 566, 0, 983, 752, 0, 0, 1142, 0,…
## $ LowQualFinSF <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ GrLivArea <int> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 2090, 1774, 10…
## $ BsmtFullBath <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,…
## $ BsmtHalfBath <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ FullBath <int> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, 1, 1, 2, 1,…
## $ HalfBath <int> 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,…
## $ BedroomAbvGr <int> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2, 3, 4, 2, 3, 2, 2, 2, 2, 3,…
## $ KitchenAbvGr <int> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1,…
## $ KitchenQual <fct> Gd, TA, Gd, Gd, Gd, TA, Gd, TA, TA, TA, TA, Ex, TA, Gd, …
## $ TotRmsAbvGrd <int> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5, 5, 11, 4, 7, 5, 5, 5, 6, 6…
## $ Functional <fct> Typ, Typ, Typ, Typ, Typ, Typ, Typ, Typ, Min1, Typ, Typ, …
## $ Fireplaces <int> 0, 1, 1, 1, 1, 0, 1, 2, 2, 2, 0, 2, 0, 1, 1, 0, 1, 0, 0,…
## $ FireplaceQu <fct> 0, TA, TA, Gd, TA, 0, Gd, TA, TA, TA, 0, Gd, 0, Gd, Fa, …
## $ GarageType <fct> Attchd, Attchd, Attchd, Detchd, Attchd, Attchd, Attchd, …
## $ GarageYrBlt <dbl> 2003, 1976, 2001, 1998, 2000, 1993, 2004, 1973, 1931, 19…
## $ GarageFinish <fct> RFn, RFn, RFn, Unf, RFn, Unf, RFn, RFn, Unf, RFn, Unf, F…
## $ GarageCars <int> 2, 2, 2, 3, 3, 2, 2, 2, 2, 1, 1, 3, 1, 3, 1, 2, 2, 2, 2,…
## $ GarageArea <int> 548, 460, 608, 642, 836, 480, 636, 484, 468, 205, 384, 7…
## $ GarageQual <fct> TA, TA, TA, TA, TA, TA, TA, TA, Fa, Gd, TA, TA, TA, TA, …
## $ GarageCond <fct> TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, …
## $ PavedDrive <fct> Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y,…
## $ WoodDeckSF <int> 0, 298, 0, 0, 192, 40, 255, 235, 90, 0, 0, 147, 140, 160…
## $ OpenPorchSF <int> 61, 0, 42, 35, 84, 30, 57, 204, 0, 4, 0, 21, 0, 33, 213,…
## $ EnclosedPorch <int> 0, 0, 0, 272, 0, 0, 0, 228, 205, 0, 0, 0, 0, 0, 176, 0, …
## $ X3SsnPorch <int> 0, 0, 0, 0, 0, 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ScreenPorch <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 176, 0, 0, 0, 0, 0, …
## $ PoolArea <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ PoolQC <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Fence <fct> 0, 0, 0, 0, 0, MnPrv, 0, 0, 0, 0, 0, 0, 0, 0, GdWo, GdPr…
## $ MiscFeature <fct> 0, 0, 0, 0, 0, Shed, 0, Shed, 0, 0, 0, 0, 0, 0, 0, 0, Sh…
## $ MiscVal <int> 0, 0, 0, 0, 0, 700, 0, 350, 0, 0, 0, 0, 0, 0, 0, 0, 700,…
## $ MoSold <int> 2, 5, 9, 2, 12, 10, 8, 11, 4, 1, 2, 7, 9, 8, 5, 7, 3, 10…
## $ YrSold <int> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 2009, 2008, 20…
## $ SaleType <fct> WD, WD, WD, WD, WD, WD, WD, WD, WD, WD, WD, New, WD, New…
## $ SaleCondition <fct> Normal, Normal, Normal, Abnorml, Normal, Normal, Normal,…
## $ SalePrice <int> 208500, 181500, 223500, 140000, 250000, 143000, 307000, …
# Count the missing values in every column of the training data.
# summarise(across(everything(), ...)) replaces the superseded
# summarize_all(); the result is still one row with one count per column.
train %>%
  summarise(across(everything(), ~ sum(is.na(.x)))) %>%
  glimpse()
## Rows: 1
## Columns: 81
## $ Id <int> 0
## $ MSSubClass <int> 0
## $ MSZoning <int> 0
## $ LotFrontage <int> 0
## $ LotArea <int> 0
## $ Street <int> 0
## $ Alley <int> 0
## $ LotShape <int> 0
## $ LandContour <int> 0
## $ Utilities <int> 0
## $ LotConfig <int> 0
## $ LandSlope <int> 0
## $ Neighborhood <int> 0
## $ Condition1 <int> 0
## $ Condition2 <int> 0
## $ BldgType <int> 0
## $ HouseStyle <int> 0
## $ OverallQual <int> 0
## $ OverallCond <int> 0
## $ YearBuilt <int> 0
## $ YearRemodAdd <int> 0
## $ RoofStyle <int> 0
## $ RoofMatl <int> 0
## $ Exterior1st <int> 0
## $ Exterior2nd <int> 0
## $ MasVnrType <int> 0
## $ MasVnrArea <int> 0
## $ ExterQual <int> 0
## $ ExterCond <int> 0
## $ Foundation <int> 0
## $ BsmtQual <int> 0
## $ BsmtCond <int> 0
## $ BsmtExposure <int> 0
## $ BsmtFinType1 <int> 0
## $ BsmtFinSF1 <int> 0
## $ BsmtFinType2 <int> 0
## $ BsmtFinSF2 <int> 0
## $ BsmtUnfSF <int> 0
## $ TotalBsmtSF <int> 0
## $ Heating <int> 0
## $ HeatingQC <int> 0
## $ CentralAir <int> 0
## $ Electrical <int> 0
## $ X1stFlrSF <int> 0
## $ X2ndFlrSF <int> 0
## $ LowQualFinSF <int> 0
## $ GrLivArea <int> 0
## $ BsmtFullBath <int> 0
## $ BsmtHalfBath <int> 0
## $ FullBath <int> 0
## $ HalfBath <int> 0
## $ BedroomAbvGr <int> 0
## $ KitchenAbvGr <int> 0
## $ KitchenQual <int> 0
## $ TotRmsAbvGrd <int> 0
## $ Functional <int> 0
## $ Fireplaces <int> 0
## $ FireplaceQu <int> 0
## $ GarageType <int> 0
## $ GarageYrBlt <int> 0
## $ GarageFinish <int> 0
## $ GarageCars <int> 0
## $ GarageArea <int> 0
## $ GarageQual <int> 0
## $ GarageCond <int> 0
## $ PavedDrive <int> 0
## $ WoodDeckSF <int> 0
## $ OpenPorchSF <int> 0
## $ EnclosedPorch <int> 0
## $ X3SsnPorch <int> 0
## $ ScreenPorch <int> 0
## $ PoolArea <int> 0
## $ PoolQC <int> 0
## $ Fence <int> 0
## $ MiscFeature <int> 0
## $ MiscVal <int> 0
## $ MoSold <int> 0
## $ YrSold <int> 0
## $ SaleType <int> 0
## $ SaleCondition <int> 0
## $ SalePrice <int> 0
# Use the classic theme for every plot drawn from here on
theme_set(theme_classic())
# Histogram of the response variable (default binning)
ggplot(training, aes(x = SalePrice)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Summary statistics of the response variable. The column is named
# `SalePrice`; the previous `train$SalesPrices` does not exist, so `$`
# returned NULL and summary() reported an empty object.
summary(train$SalePrice)
# Keep only the numeric columns of each data set.
# select(where(is.numeric)) replaces the superseded select_if().
training_numerical <- training %>% select(where(is.numeric))
testing_numerical <- testing %>% select(where(is.numeric))
dim(training_numerical)
## [1] 1460 38
dim(testing_numerical)
## [1] 1459 37
# feature selection
# Pairwise correlations between all numeric training columns
cor_matrix <- cor(training_numerical)
# Draw the upper triangle of the correlation matrix as circles
corrplot(
  cor_matrix,
  method = "circle",
  type = "upper",
  tl.col = "black",
  tl.srt = 45
)
# Baseline regression: SalePrice on every other numeric column
reg <- lm(formula = SalePrice ~ ., data = training_numerical)
summary(reg)
##
## Call:
## lm(formula = SalePrice ~ ., data = training_numerical)
##
## Residuals:
## Min 1Q Median 3Q Max
## -473606 -16047 -2186 14258 298649
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.843e+05 1.401e+06 0.346 0.729585
## Id -1.050e+00 2.168e+00 -0.485 0.628079
## MSSubClass -1.673e+02 2.649e+01 -6.315 3.61e-10 ***
## LotFrontage 8.827e+00 2.852e+01 0.310 0.756985
## LotArea 3.917e-01 9.990e-02 3.921 9.23e-05 ***
## OverallQual 1.731e+04 1.181e+03 14.660 < 2e-16 ***
## OverallCond 5.113e+03 1.023e+03 5.000 6.45e-07 ***
## YearBuilt 3.381e+02 6.044e+01 5.594 2.65e-08 ***
## YearRemodAdd 1.221e+02 6.602e+01 1.849 0.064638 .
## MasVnrArea 2.808e+01 5.918e+00 4.744 2.31e-06 ***
## BsmtFinSF1 1.861e+01 4.632e+00 4.019 6.16e-05 ***
## BsmtFinSF2 9.144e+00 7.003e+00 1.306 0.191873
## BsmtUnfSF 8.506e+00 4.168e+00 2.041 0.041452 *
## TotalBsmtSF NA NA NA NA
## X1stFlrSF 4.686e+01 5.726e+00 8.183 6.07e-16 ***
## X2ndFlrSF 4.810e+01 4.918e+00 9.779 < 2e-16 ***
## LowQualFinSF 1.779e+01 1.968e+01 0.904 0.366030
## GrLivArea NA NA NA NA
## BsmtFullBath 8.549e+03 2.594e+03 3.295 0.001008 **
## BsmtHalfBath 1.709e+03 4.054e+03 0.421 0.673467
## FullBath 3.234e+03 2.800e+03 1.155 0.248390
## HalfBath -1.913e+03 2.642e+03 -0.724 0.469234
## BedroomAbvGr -1.027e+04 1.680e+03 -6.114 1.25e-09 ***
## KitchenAbvGr -1.576e+04 5.195e+03 -3.033 0.002466 **
## TotRmsAbvGrd 5.005e+03 1.228e+03 4.076 4.84e-05 ***
## Fireplaces 4.075e+03 1.757e+03 2.319 0.020556 *
## GarageYrBlt -1.457e+01 2.683e+00 -5.428 6.67e-08 ***
## GarageCars 1.569e+04 2.974e+03 5.275 1.53e-07 ***
## GarageArea 5.001e+00 9.716e+00 0.515 0.606854
## WoodDeckSF 2.579e+01 7.926e+00 3.254 0.001164 **
## OpenPorchSF -6.285e+00 1.506e+01 -0.417 0.676420
## EnclosedPorch 1.162e+01 1.671e+01 0.695 0.487055
## X3SsnPorch 2.023e+01 3.114e+01 0.650 0.515997
## ScreenPorch 5.771e+01 1.704e+01 3.387 0.000726 ***
## PoolArea -3.215e+01 2.354e+01 -1.366 0.172244
## MiscVal -4.808e-01 1.844e+00 -0.261 0.794330
## MoSold -4.188e+01 3.420e+02 -0.122 0.902546
## YrSold -7.129e+02 6.964e+02 -1.024 0.306203
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 34450 on 1424 degrees of freedom
## Multiple R-squared: 0.8164, Adjusted R-squared: 0.8119
## F-statistic: 180.9 on 35 and 1424 DF, p-value: < 2.2e-16
# Predictor variables retained for the reduced regression model
predictors <- c(
  "MSSubClass", "LotArea", "OverallQual", "OverallCond", "YearBuilt",
  "MasVnrArea", "BsmtFinSF1", "X1stFlrSF", "X2ndFlrSF", "GrLivArea",
  "BsmtFullBath", "BedroomAbvGr", "KitchenAbvGr", "TotRmsAbvGrd",
  "GarageYrBlt", "GarageCars", "OpenPorchSF", "ScreenPorch", "PoolArea"
)
# Keep the predictors plus the response, then drop rows with missing values.
# NOTE(review): `-1` is a row index here, so the first observation is left
# out of the fit. Columns are already picked by name, so if the intent was
# to drop the Id column the `-1` is not needed -- confirm before changing,
# since the model summary and anything sized to 1459 rows depend on it.
regression_data <- na.omit(training[-1, c(predictors, "SalePrice")])
# Fit the multiple regression model and print its summary
model <- lm(formula = SalePrice ~ ., data = regression_data)
summary(model)
##
## Call:
## lm(formula = SalePrice ~ ., data = regression_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -475371 -15798 -2237 14500 290412
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.379e+05 8.896e+04 -9.419 < 2e-16 ***
## MSSubClass -1.662e+02 2.570e+01 -6.465 1.39e-10 ***
## LotArea 4.604e-01 9.822e-02 4.688 3.02e-06 ***
## OverallQual 1.857e+04 1.114e+03 16.664 < 2e-16 ***
## OverallCond 5.894e+03 9.172e+02 6.427 1.77e-10 ***
## YearBuilt 4.021e+02 4.504e+01 8.929 < 2e-16 ***
## MasVnrArea 2.734e+01 5.842e+00 4.680 3.14e-06 ***
## BsmtFinSF1 1.130e+01 2.951e+00 3.829 0.000134 ***
## X1stFlrSF 3.887e+01 1.987e+01 1.956 0.050685 .
## X2ndFlrSF 3.022e+01 1.963e+01 1.539 0.123919
## GrLivArea 2.073e+01 1.964e+01 1.055 0.291505
## BsmtFullBath 8.904e+03 2.353e+03 3.784 0.000161 ***
## BedroomAbvGr -1.060e+04 1.630e+03 -6.500 1.11e-10 ***
## KitchenAbvGr -1.879e+04 5.043e+03 -3.727 0.000202 ***
## TotRmsAbvGrd 5.076e+03 1.221e+03 4.156 3.44e-05 ***
## GarageYrBlt -1.531e+01 2.618e+00 -5.848 6.15e-09 ***
## GarageCars 1.758e+04 2.046e+03 8.594 < 2e-16 ***
## OpenPorchSF -5.666e+00 1.482e+01 -0.382 0.702278
## ScreenPorch 5.151e+01 1.654e+01 3.114 0.001884 **
## PoolArea -2.762e+01 2.330e+01 -1.185 0.236207
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 34610 on 1439 degrees of freedom
## Multiple R-squared: 0.8128, Adjusted R-squared: 0.8103
## F-statistic: 328.8 on 19 and 1439 DF, p-value: < 2.2e-16
# Read the sample submission; it supplies the Ids of the houses to predict
sample_submission <- read.csv("https://raw.githubusercontent.com/joewarner89/DATA-605-Computational-Mathematics/main/project/sample_submission.csv")
# Create a new data frame with only the Id column
predictions_df <- data.frame(ID = sample_submission$Id)
# Predict on the TEST houses. The previous code passed `regression_data`
# (the training rows) as `newdata`, so the submission held training-set
# fitted values filed under test Ids.
# Align the test rows with the submission Ids and fail loudly on a mismatch.
test_rows <- match(sample_submission$Id, testing$Id)
stopifnot(!anyNA(test_rows))
test_features <- testing[test_rows, predictors, drop = FALSE]
# predict() returns NA for any row with a missing predictor. Fill with zero,
# following the report's stated treatment of missing values.
# NOTE(review): confirm the test set should be imputed the same way as train.
test_features[is.na(test_features)] <- 0
predictions_df$SalePrice <- unname(predict(model, newdata = test_features))
# Residual diagnostics for the fitted model
res <- resid(model)
# Residuals vs. fitted values, with a horizontal reference line at 0
plot(fitted(model), res)
abline(h = 0)
# Normal Q-Q plot of the residuals with its reference line
qqnorm(res)
qqline(res)
# Write the predictions to a CSV file
write.csv(predictions_df, file = "predictions.csv", row.names = FALSE)
# Verify the number of rows in the predictions file
num_rows <- nrow(predictions_df)
print(num_rows) # Should be 1459
## [1] 1459