library(readr)
library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ purrr 1.0.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(skimr)
## Warning: package 'skimr' was built under R version 4.4.3
library(DataExplorer)
## Warning: package 'DataExplorer' was built under R version 4.4.3
# Load the house-pricing workbook and print a compact per-column overview.
# NOTE(review): the path is an absolute location on one machine.
hpricing <- read_xlsx(
  path = "C:\\Users\\Administrator\\Downloads\\Datasets\\hpricing.xlsx"
)
glimpse(hpricing)
## Rows: 1,460
## Columns: 26
## $ HouseId <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14…
## $ MSZoning <chr> "Residential Low Density", "Residential Low D…
## $ LotAreaSquareFeet <dbl> 8450, 9600, 11250, 9550, 14260, 14115, 10084,…
## $ LandSlope <chr> "Gentleslope", "Gentleslope", "Gentleslope", …
## $ BuildingType <chr> "Single-family Detached", "Single-family Deta…
## $ OverallCondition <dbl> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 5, 5, …
## $ YearBuilt <dbl> 2003, 1976, 2001, 1915, 2000, 1993, 2004, 197…
## $ ExteriorCondition <chr> "Average/Typical", "Average/Typical", "Averag…
## $ Foundation <chr> "Poured Contrete", "Cinder Block", "Poured Co…
## $ TotalBasementSquareFeet <dbl> 856, 1262, 920, 756, 1145, 796, 1686, 1107, 9…
## $ HeatingQualityCondition <chr> "Excellent", "Excellent", "Excellent", "Good"…
## $ CentralAirConditioning <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Ye…
## $ `1stFloorSquareFeet` <dbl> 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1…
## $ `2ndFlrSquareFeet` <dbl> 854, 0, 866, 756, 1053, 566, 0, 983, 752, 0, …
## $ LivAreaSquareFeet <dbl> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 209…
## $ FullBathrooms <dbl> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, …
## $ Bedrooms <dbl> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2, 3, 4, 2, 3, 2, …
## $ KitchenQualityCondition <chr> "Good", "Typical/Average", "Good", "Good", "G…
## $ TotalRooms <dbl> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5, 5, 11, 4, 7, 5,…
## $ GarageArea <dbl> 548, 460, 608, 642, 836, 480, 636, 484, 468, …
## $ TotalPorchAreaSquareFeet <dbl> 61, 0, 42, 307, 84, 30, 57, 432, 205, 4, 0, 2…
## $ MonthSold <dbl> 2, 5, 9, 2, 12, 10, 8, 11, 4, 1, 2, 7, 9, 8, …
## $ YearSold <dbl> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 200…
## $ SaleType <chr> "Warranty Deed - Conventional", "Warranty Dee…
## $ SaleCondition <chr> "Normal", "Normal", "Normal", "Abnorml", "Nor…
## $ SalePrice <dbl> 208500, 181500, 223500, 140000, 250000, 14300…
# Per-column summary: quantiles and mean for numeric columns,
# length/class/mode for character columns.
hpricing %>%
  summary()
## HouseId MSZoning LotAreaSquareFeet LandSlope
## Min. : 1.0 Length:1460 Min. : 1300 Length:1460
## 1st Qu.: 365.8 Class :character 1st Qu.: 7554 Class :character
## Median : 730.5 Mode :character Median : 9478 Mode :character
## Mean : 730.5 Mean : 10517
## 3rd Qu.:1095.2 3rd Qu.: 11602
## Max. :1460.0 Max. :215245
## BuildingType OverallCondition YearBuilt ExteriorCondition
## Length:1460 Min. :1.000 Min. :1872 Length:1460
## Class :character 1st Qu.:5.000 1st Qu.:1954 Class :character
## Mode :character Median :5.000 Median :1973 Mode :character
## Mean :5.575 Mean :1971
## 3rd Qu.:6.000 3rd Qu.:2000
## Max. :9.000 Max. :2010
## Foundation TotalBasementSquareFeet HeatingQualityCondition
## Length:1460 Min. : 0.0 Length:1460
## Class :character 1st Qu.: 795.8 Class :character
## Mode :character Median : 991.5 Mode :character
## Mean :1057.4
## 3rd Qu.:1298.2
## Max. :6110.0
## CentralAirConditioning 1stFloorSquareFeet 2ndFlrSquareFeet LivAreaSquareFeet
## Length:1460 Min. : 334 Min. : 0 Min. : 334
## Class :character 1st Qu.: 882 1st Qu.: 0 1st Qu.:1130
## Mode :character Median :1087 Median : 0 Median :1464
## Mean :1163 Mean : 347 Mean :1515
## 3rd Qu.:1391 3rd Qu.: 728 3rd Qu.:1777
## Max. :4692 Max. :2065 Max. :5642
## FullBathrooms Bedrooms KitchenQualityCondition TotalRooms
## Min. :0.000 Min. :0.000 Length:1460 Min. : 2.000
## 1st Qu.:1.000 1st Qu.:2.000 Class :character 1st Qu.: 5.000
## Median :2.000 Median :3.000 Mode :character Median : 6.000
## Mean :1.565 Mean :2.866 Mean : 6.518
## 3rd Qu.:2.000 3rd Qu.:3.000 3rd Qu.: 7.000
## Max. :3.000 Max. :8.000 Max. :14.000
## GarageArea TotalPorchAreaSquareFeet MonthSold YearSold
## Min. : 0.0 Min. : 0.00 Min. : 1.000 Min. :2006
## 1st Qu.: 334.5 1st Qu.: 0.00 1st Qu.: 5.000 1st Qu.:2007
## Median : 480.0 Median : 40.00 Median : 6.000 Median :2008
## Mean : 473.0 Mean : 68.61 Mean : 6.322 Mean :2008
## 3rd Qu.: 576.0 3rd Qu.:104.00 3rd Qu.: 8.000 3rd Qu.:2009
## Max. :1418.0 Max. :638.00 Max. :12.000 Max. :2010
## SaleType SaleCondition SalePrice
## Length:1460 Length:1460 Min. : 34900
## Class :character Class :character 1st Qu.:129975
## Mode :character Mode :character Median :163000
## Mean :180921
## 3rd Qu.:214000
## Max. :755000
# Number of rows and columns in the data set.
hpricing %>%
  dim()
## [1] 1460 26
# Total count of missing cells across the whole data set.
hpricing %>%
  is.na() %>%
  sum()
## [1] 0
# Detailed per-type overview (missingness, quantiles, inline histograms).
# Called directly rather than through a pipe so the report keeps the
# data set's own name.
skimr::skim(data = hpricing)
| Name | hpricing |
| Number of rows | 1460 |
| Number of columns | 26 |
| _______________________ | |
| Column type frequency: | |
| character | 10 |
| numeric | 16 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| MSZoning | 0 | 1 | 10 | 28 | 0 | 5 | 0 |
| LandSlope | 0 | 1 | 11 | 13 | 0 | 3 | 0 |
| BuildingType | 0 | 1 | 6 | 62 | 0 | 5 | 0 |
| ExteriorCondition | 0 | 1 | 4 | 15 | 0 | 5 | 0 |
| Foundation | 0 | 1 | 4 | 15 | 0 | 6 | 0 |
| HeatingQualityCondition | 0 | 1 | 4 | 15 | 0 | 5 | 0 |
| CentralAirConditioning | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
| KitchenQualityCondition | 0 | 1 | 4 | 15 | 0 | 4 | 0 |
| SaleType | 0 | 1 | 5 | 42 | 0 | 9 | 0 |
| SaleCondition | 0 | 1 | 6 | 7 | 0 | 6 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| HouseId | 0 | 1 | 730.50 | 421.61 | 1 | 365.75 | 730.5 | 1095.25 | 1460 | ▇▇▇▇▇ |
| LotAreaSquareFeet | 0 | 1 | 10516.83 | 9981.26 | 1300 | 7553.50 | 9478.5 | 11601.50 | 215245 | ▇▁▁▁▁ |
| OverallCondition | 0 | 1 | 5.58 | 1.11 | 1 | 5.00 | 5.0 | 6.00 | 9 | ▁▁▇▅▁ |
| YearBuilt | 0 | 1 | 1971.27 | 30.20 | 1872 | 1954.00 | 1973.0 | 2000.00 | 2010 | ▁▂▃▆▇ |
| TotalBasementSquareFeet | 0 | 1 | 1057.43 | 438.71 | 0 | 795.75 | 991.5 | 1298.25 | 6110 | ▇▃▁▁▁ |
| 1stFloorSquareFeet | 0 | 1 | 1162.63 | 386.59 | 334 | 882.00 | 1087.0 | 1391.25 | 4692 | ▇▅▁▁▁ |
| 2ndFlrSquareFeet | 0 | 1 | 346.99 | 436.53 | 0 | 0.00 | 0.0 | 728.00 | 2065 | ▇▃▂▁▁ |
| LivAreaSquareFeet | 0 | 1 | 1515.46 | 525.48 | 334 | 1129.50 | 1464.0 | 1776.75 | 5642 | ▇▇▁▁▁ |
| FullBathrooms | 0 | 1 | 1.57 | 0.55 | 0 | 1.00 | 2.0 | 2.00 | 3 | ▁▇▁▇▁ |
| Bedrooms | 0 | 1 | 2.87 | 0.82 | 0 | 2.00 | 3.0 | 3.00 | 8 | ▁▇▂▁▁ |
| TotalRooms | 0 | 1 | 6.52 | 1.63 | 2 | 5.00 | 6.0 | 7.00 | 14 | ▂▇▇▁▁ |
| GarageArea | 0 | 1 | 472.98 | 213.80 | 0 | 334.50 | 480.0 | 576.00 | 1418 | ▂▇▃▁▁ |
| TotalPorchAreaSquareFeet | 0 | 1 | 68.61 | 85.86 | 0 | 0.00 | 40.0 | 104.00 | 638 | ▇▂▁▁▁ |
| MonthSold | 0 | 1 | 6.32 | 2.70 | 1 | 5.00 | 6.0 | 8.00 | 12 | ▃▆▇▃▃ |
| YearSold | 0 | 1 | 2007.82 | 1.33 | 2006 | 2007.00 | 2008.0 | 2009.00 | 2010 | ▇▇▇▇▅ |
| SalePrice | 0 | 1 | 180921.20 | 79442.50 | 34900 | 129975.00 | 163000.0 | 214000.00 | 755000 | ▇▅▁▁▁ |
# Number of rows that are exact duplicates of an earlier row.
hpricing %>%
  duplicated() %>%
  sum()
## [1] 0
# Linear regression of SalePrice on eight numeric predictors.
# The formula is written inline (not stored in a variable) so that the
# `Call:` section printed by summary() shows the full model.
hpricing_model4 <- lm(
  SalePrice ~ GarageArea + TotalBasementSquareFeet + LotAreaSquareFeet +
    LivAreaSquareFeet + TotalPorchAreaSquareFeet + TotalRooms +
    FullBathrooms + OverallCondition,
  data = hpricing
)

# Coefficient table, residual summary and goodness-of-fit statistics.
summary(hpricing_model4)
##
## Call:
## lm(formula = SalePrice ~ GarageArea + TotalBasementSquareFeet +
## LotAreaSquareFeet + LivAreaSquareFeet + TotalPorchAreaSquareFeet +
## TotalRooms + FullBathrooms + OverallCondition, data = hpricing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -623128 -19500 -1009 17110 286592
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.517e+04 8.887e+03 -6.208 7.01e-10 ***
## GarageArea 9.705e+01 6.784e+00 14.306 < 2e-16 ***
## TotalBasementSquareFeet 4.887e+01 3.329e+00 14.678 < 2e-16 ***
## LotAreaSquareFeet 2.307e-01 1.249e-01 1.846 0.0650 .
## LivAreaSquareFeet 6.480e+01 4.800e+00 13.500 < 2e-16 ***
## TotalPorchAreaSquareFeet -2.582e+01 1.434e+01 -1.801 0.0720 .
## TotalRooms -3.032e+03 1.320e+03 -2.297 0.0218 *
## FullBathrooms 2.084e+04 2.864e+03 7.274 5.67e-13 ***
## OverallCondition 4.811e+03 1.100e+03 4.372 1.32e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 45190 on 1451 degrees of freedom
## Multiple R-squared: 0.6782, Adjusted R-squared: 0.6764
## F-statistic: 382.2 on 8 and 1451 DF, p-value: < 2.2e-16
# Draw the lm diagnostic plots in a 2 x 2 grid.
# par() returns the previous settings; they are restored afterwards so that
# later base-graphics calls in this script (qqnorm(), hist()) get a full
# plotting region instead of being squeezed into the same grid.
old_par <- par(mfrow = c(2, 2))
plot(hpricing_model4)
par(old_par)
# For every numeric column, list the values that boxplot.stats() reports
# as outliers (its `out` component). Columns with none yield numeric(0).
hpricing %>%
  select(where(is.numeric)) %>%
  map(\(column) boxplot.stats(column)$out)
## $HouseId
## numeric(0)
##
## $LotAreaSquareFeet
## [1] 50271 19900 21000 21453 19378 31770 22950 25419 159000 19296
## [11] 39104 19138 18386 215245 164660 20431 18800 53107 34650 22420
## [21] 21750 70761 53227 40094 32668 21872 21780 25095 46589 20896
## [31] 18450 21535 26178 115149 21695 53504 21384 28698 45600 17920
## [41] 25286 27650 24090 25000 1300 21286 21750 29959 18000 23257
## [51] 17755 35760 18030 35133 32463 18890 24682 23595 17871 36500
## [61] 63887 20781 25339 57200 20544 19690 21930 26142
##
## $OverallCondition
## [1] 8 8 8 2 3 8 8 8 8 9 8 9 9 3 9 8 2 8 8 9 8 3 8 8 8 1 2 3 3 8 2 8 3 8 8 8 8
## [38] 9 8 8 8 8 9 9 3 8 3 8 9 8 8 8 3 3 3 2 8 8 9 8 8 9 8 9 3 8 8 8 8 8 8 8 8 8
## [75] 8 3 8 3 8 8 8 9 8 9 8 3 8 3 8 3 3 8 3 8 8 3 9 3 8 3 9 8 8 8 8 8 8 8 9 3 8
## [112] 8 8 8 9 8 3 9 9 3 8 8 8 9 9
##
## $YearBuilt
## [1] 1880 1880 1880 1882 1880 1875 1872
##
## $TotalBasementSquareFeet
## [1] 0 0 2223 0 0 0 2216 0 2392 0 2121 2136 3206 0 0
## [16] 0 0 3094 2153 3200 0 3138 0 0 0 0 2109 2077 2444 0
## [31] 0 0 0 2078 0 2217 0 0 2330 0 0 0 0 2524 0
## [46] 0 0 0 0 2396 2158 0 0 2136 0 2076 2110 6110 0 2633
## [61] 0
##
## $`1stFloorSquareFeet`
## [1] 2207 2223 2259 2158 2234 2392 2402 3228 3138 2515 2444 2217 2364 2898 2524
## [16] 2411 2196 4692 2156 2633
##
## $`2ndFlrSquareFeet`
## [1] 1872 2065
##
## $LivAreaSquareFeet
## [1] 2945 3222 3608 3112 2794 3493 2978 3228 4676 2775 3194 3395 4316 3279 3140
## [16] 2822 2872 2898 3082 2868 2828 3627 3086 2872 4476 3447 5642 2810 2792 3238
## [31] 2784
##
## $FullBathrooms
## numeric(0)
##
## $Bedrooms
## [1] 0 5 5 6 0 5 6 5 5 6 5 6 5 0 8 5 6 5 5 6 5 5 5 5 5 5 5 0 0 5 0 5 6 5 5
##
## $TotalRooms
## [1] 11 11 12 11 11 11 11 14 11 12 11 12 11 11 12 11 12 11 12 11 11 12 12 11 11
## [26] 12 12 12 11 11
##
## $GarageArea
## [1] 1166 968 1053 1025 947 1390 1134 983 1020 1220 1248 1043 1052 995 1356
## [16] 1052 954 1014 1418 968 1069
##
## $TotalPorchAreaSquareFeet
## [1] 307 432 389 312 276 568 269 348 268 328 294 346 269 638 366 285 324 406 308
## [20] 502 274 293 352 340 365 267 265 288 370 262 341 386 425 264 299 312 275 346
## [39] 264 418 364 416 318 282 319 280 263 306 304 330 264 287 292 369 286 547 301
## [58] 262 289 316 304
##
## $MonthSold
## numeric(0)
##
## $YearSold
## numeric(0)
##
## $SalePrice
## [1] 345000 385000 438780 383970 372402 412500 501837 475000 386250 403000
## [11] 415298 360000 375000 342643 354000 377426 437154 394432 426000 555000
## [21] 440000 380000 374000 430000 402861 446261 369900 451950 359100 345000
## [31] 370878 350000 402000 423000 372500 392000 755000 361919 341000 538000
## [41] 395000 485000 582933 385000 350000 611657 395192 348000 556581 424870
## [51] 625000 392500 745000 367294 465000 378500 381000 410000 466500 377500
## [61] 394617
# Scatter plot of SalePrice (y axis) against a single column of hpricing.
#   predictor  - column name, as a string
#   axis_title - label shown on the x axis
# Returns the ggplot object; calling it at top level draws the plot.
price_scatter <- function(predictor, axis_title) {
  ggplot(data = hpricing, aes(x = .data[[predictor]], y = SalePrice)) +
    geom_point() +
    labs(y = "SALE_PRICE", x = axis_title)
}

# One plot per predictor, in the same order as before.
price_scatter("GarageArea", "GARAGE_AREA")
price_scatter("TotalBasementSquareFeet", "TOTAL_BASEMENT_SQUAREFEET")
price_scatter("LotAreaSquareFeet", "LOT_AREA_SQUAREFEET")
price_scatter("LivAreaSquareFeet", "LIV_AREA_SQUAREFEET")
price_scatter("TotalPorchAreaSquareFeet", "TOTAL_PORCH_AREA_SQUAREFEET")
price_scatter("FullBathrooms", "FULL_BATHROOMS")
price_scatter("OverallCondition", "OVERALL_CONDITION")
# Normal Q-Q plot of SalePrice with a red reference line.
with(hpricing, {
  qqnorm(SalePrice)
  qqline(SalePrice, col = "red")
})
### The Q-Q plot suggests that house prices have a long tail on the higher end,
### meaning a few houses are priced significantly higher than the majority.
### Additionally, the curvature at both ends of the plot confirms the presence
### of outliers and non-normality in the data.
# Shapiro-Wilk normality test on YearBuilt.
# NOTE(review): the neighbouring diagnostics (Q-Q plot above, KS test below)
# all target SalePrice - confirm YearBuilt is the intended variable here.
shapiro.test(hpricing$YearBuilt)
##
## Shapiro-Wilk normality test
##
## data: hpricing$YearBuilt
## W = 0.9256, p-value < 2.2e-16
# One-sample Kolmogorov-Smirnov test of SalePrice against a normal
# distribution.
# With no extra arguments, ks.test(x, "pnorm") compares against the standard
# normal N(0, 1). Every sale price is far above that range, which is why the
# previously recorded result was the degenerate D = 1. The reference
# distribution is therefore given the sample mean and standard deviation.
# NOTE: because these parameters are estimated from the same data, the
# reported p-value is only approximate. Output recorded below this call was
# produced by the old N(0, 1) comparison and needs to be regenerated.
ks.test(
  hpricing$SalePrice,
  "pnorm",
  mean = mean(hpricing$SalePrice),
  sd = sd(hpricing$SalePrice)
)
## Warning in ks.test.default(hpricing$SalePrice, "pnorm"): ties should not be
## present for the one-sample Kolmogorov-Smirnov test
##
## Asymptotic one-sample Kolmogorov-Smirnov test
##
## data: hpricing$SalePrice
## D = 1, p-value < 2.2e-16
## alternative hypothesis: two-sided
# One-sample t-test on SalePrice. No `mu` is supplied, so the null
# hypothesis is a true mean of 0 (see the "alternative hypothesis" line in
# the output); the call also reports a 95 percent confidence interval for
# the mean sale price.
t.test(hpricing$SalePrice)
##
## One Sample t-test
##
## data: hpricing$SalePrice
## t = 87.019, df = 1459, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 176842.8 184999.6
## sample estimates:
## mean of x
## 180921.2
# Arithmetic mean of the sale prices.
mean(hpricing[["SalePrice"]])
## [1] 180921.2
# Median of the sale prices.
median(hpricing[["SalePrice"]])
## [1] 163000
## Since the mean (180921.2) is higher than the median (163000), the distribution is right-skewed: some very expensive houses are pulling up the average.
# Histogram of sale prices (about 30 bins, blue bars with white borders).
hist(
  x = hpricing[["SalePrice"]],
  main = "Distribution of House Sale Prices",
  xlab = "Sale Price",
  breaks = 30,
  col = "blue",
  border = "white"
)
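### The median sale price is 163000, so half of the houses sold above it; only a few sold far above the third quartile (214000), which gives the histogram its long right tail.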