library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(corrplot)
## corrplot 0.95 loaded
# Load the house pricing workbook into a tibble.
# NOTE(review): the path is absolute and machine-specific; consider a
# project-relative path so the script can run on other machines.
House_pricing_data <- read_excel("/Users/cheptoo/Downloads/House pricing data.xlsx")
# View() opens a data viewer and is only meaningful in an interactive
# session, so skip it when the script is run or knitted non-interactively.
if (interactive()) {
  View(House_pricing_data)
}
Checking the columns:
# Print the names of all columns in the dataset.
colnames(House_pricing_data)
## [1] "HouseId" "MSZoning"
## [3] "LotAreaSquareFeet" "LandSlope"
## [5] "BuildingType" "OverallCondition"
## [7] "YearBuilt" "ExteriorCondition"
## [9] "Foundation" "TotalBasementSquareFeet"
## [11] "HeatingQualityCondition" "CentralAirConditioning"
## [13] "1stFloorSquareFeet" "2ndFlrSquareFeet"
## [15] "LivAreaSquareFeet" "FullBathrooms"
## [17] "Bedrooms" "KitchenQualityCondition"
## [19] "TotalRooms" "GarageArea"
## [21] "TotalPorchAreaSquareFeet" "MonthSold"
## [23] "YearSold" "SaleType"
## [25] "SaleCondition" "SalePrice"
To understand the structure and distribution of the data, and to identify trends and relationships between the variables, we first inspect the dataset.
# Preview the first six rows of the dataset.
head(House_pricing_data)
## # A tibble: 6 × 26
## HouseId MSZoning LotAreaSquareFeet LandSlope BuildingType OverallCondition
## <dbl> <chr> <dbl> <chr> <chr> <dbl>
## 1 1 Residential… 8450 Gentlesl… Single-fami… 5
## 2 2 Residential… 9600 Gentlesl… Single-fami… 8
## 3 3 Residential… 11250 Gentlesl… Single-fami… 5
## 4 4 Residential… 9550 Gentlesl… Single-fami… 5
## 5 5 Residential… 14260 Gentlesl… Single-fami… 5
## 6 6 Residential… 14115 Gentlesl… Single-fami… 5
## # ℹ 20 more variables: YearBuilt <dbl>, ExteriorCondition <chr>,
## # Foundation <chr>, TotalBasementSquareFeet <dbl>,
## # HeatingQualityCondition <chr>, CentralAirConditioning <chr>,
## # `1stFloorSquareFeet` <dbl>, `2ndFlrSquareFeet` <dbl>,
## # LivAreaSquareFeet <dbl>, FullBathrooms <dbl>, Bedrooms <dbl>,
## # KitchenQualityCondition <chr>, TotalRooms <dbl>, GarageArea <dbl>,
## # TotalPorchAreaSquareFeet <dbl>, MonthSold <dbl>, YearSold <dbl>, …
# Preview the last six rows of the dataset.
tail(House_pricing_data)
## # A tibble: 6 × 26
## HouseId MSZoning LotAreaSquareFeet LandSlope BuildingType OverallCondition
## <dbl> <chr> <dbl> <chr> <chr> <dbl>
## 1 1455 Floating Vi… 7500 Gentlesl… Single-fami… 5
## 2 1456 Residential… 7917 Gentlesl… Single-fami… 5
## 3 1457 Residential… 13175 Gentlesl… Single-fami… 6
## 4 1458 Residential… 9042 Gentlesl… Single-fami… 9
## 5 1459 Residential… 9717 Gentlesl… Single-fami… 6
## 6 1460 Residential… 9937 Gentlesl… Single-fami… 6
## # ℹ 20 more variables: YearBuilt <dbl>, ExteriorCondition <chr>,
## # Foundation <chr>, TotalBasementSquareFeet <dbl>,
## # HeatingQualityCondition <chr>, CentralAirConditioning <chr>,
## # `1stFloorSquareFeet` <dbl>, `2ndFlrSquareFeet` <dbl>,
## # LivAreaSquareFeet <dbl>, FullBathrooms <dbl>, Bedrooms <dbl>,
## # KitchenQualityCondition <chr>, TotalRooms <dbl>, GarageArea <dbl>,
## # TotalPorchAreaSquareFeet <dbl>, MonthSold <dbl>, YearSold <dbl>, …
# Compact overview: one line per column showing its type and leading values.
glimpse(House_pricing_data)
## Rows: 1,460
## Columns: 26
## $ HouseId <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14…
## $ MSZoning <chr> "Residential Low Density", "Residential Low D…
## $ LotAreaSquareFeet <dbl> 8450, 9600, 11250, 9550, 14260, 14115, 10084,…
## $ LandSlope <chr> "Gentleslope", "Gentleslope", "Gentleslope", …
## $ BuildingType <chr> "Single-family Detached", "Single-family Deta…
## $ OverallCondition <dbl> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 5, 5, …
## $ YearBuilt <dbl> 2003, 1976, 2001, 1915, 2000, 1993, 2004, 197…
## $ ExteriorCondition <chr> "Average/Typical", "Average/Typical", "Averag…
## $ Foundation <chr> "Poured Contrete", "Cinder Block", "Poured Co…
## $ TotalBasementSquareFeet <dbl> 856, 1262, 920, 756, 1145, 796, 1686, 1107, 9…
## $ HeatingQualityCondition <chr> "Excellent", "Excellent", "Excellent", "Good"…
## $ CentralAirConditioning <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Ye…
## $ `1stFloorSquareFeet` <dbl> 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1…
## $ `2ndFlrSquareFeet` <dbl> 854, 0, 866, 756, 1053, 566, 0, 983, 752, 0, …
## $ LivAreaSquareFeet <dbl> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 209…
## $ FullBathrooms <dbl> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, …
## $ Bedrooms <dbl> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2, 3, 4, 2, 3, 2, …
## $ KitchenQualityCondition <chr> "Good", "Typical/Average", "Good", "Good", "G…
## $ TotalRooms <dbl> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5, 5, 11, 4, 7, 5,…
## $ GarageArea <dbl> 548, 460, 608, 642, 836, 480, 636, 484, 468, …
## $ TotalPorchAreaSquareFeet <dbl> 61, 0, 42, 307, 84, 30, 57, 432, 205, 4, 0, 2…
## $ MonthSold <dbl> 2, 5, 9, 2, 12, 10, 8, 11, 4, 1, 2, 7, 9, 8, …
## $ YearSold <dbl> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 200…
## $ SaleType <chr> "Warranty Deed - Conventional", "Warranty Dee…
## $ SaleCondition <chr> "Normal", "Normal", "Normal", "Abnorml", "Nor…
## $ SalePrice <dbl> 208500, 181500, 223500, 140000, 250000, 14300…
# Base R structure listing: object class, dimensions, and per-column type.
str(House_pricing_data)
## tibble [1,460 × 26] (S3: tbl_df/tbl/data.frame)
## $ HouseId : num [1:1460] 1 2 3 4 5 6 7 8 9 10 ...
## $ MSZoning : chr [1:1460] "Residential Low Density" "Residential Low Density" "Residential Low Density" "Residential Low Density" ...
## $ LotAreaSquareFeet : num [1:1460] 8450 9600 11250 9550 14260 ...
## $ LandSlope : chr [1:1460] "Gentleslope" "Gentleslope" "Gentleslope" "Gentleslope" ...
## $ BuildingType : chr [1:1460] "Single-family Detached" "Single-family Detached" "Single-family Detached" "Single-family Detached" ...
## $ OverallCondition : num [1:1460] 5 8 5 5 5 5 5 6 5 6 ...
## $ YearBuilt : num [1:1460] 2003 1976 2001 1915 2000 ...
## $ ExteriorCondition : chr [1:1460] "Average/Typical" "Average/Typical" "Average/Typical" "Average/Typical" ...
## $ Foundation : chr [1:1460] "Poured Contrete" "Cinder Block" "Poured Contrete" "Brick & Tile" ...
## $ TotalBasementSquareFeet : num [1:1460] 856 1262 920 756 1145 ...
## $ HeatingQualityCondition : chr [1:1460] "Excellent" "Excellent" "Excellent" "Good" ...
## $ CentralAirConditioning : chr [1:1460] "Yes" "Yes" "Yes" "Yes" ...
## $ 1stFloorSquareFeet : num [1:1460] 856 1262 920 961 1145 ...
## $ 2ndFlrSquareFeet : num [1:1460] 854 0 866 756 1053 ...
## $ LivAreaSquareFeet : num [1:1460] 1710 1262 1786 1717 2198 ...
## $ FullBathrooms : num [1:1460] 2 2 2 1 2 1 2 2 2 1 ...
## $ Bedrooms : num [1:1460] 3 3 3 3 4 1 3 3 2 2 ...
## $ KitchenQualityCondition : chr [1:1460] "Good" "Typical/Average" "Good" "Good" ...
## $ TotalRooms : num [1:1460] 8 6 6 7 9 5 7 7 8 5 ...
## $ GarageArea : num [1:1460] 548 460 608 642 836 480 636 484 468 205 ...
## $ TotalPorchAreaSquareFeet: num [1:1460] 61 0 42 307 84 30 57 432 205 4 ...
## $ MonthSold : num [1:1460] 2 5 9 2 12 10 8 11 4 1 ...
## $ YearSold : num [1:1460] 2008 2007 2008 2006 2008 ...
## $ SaleType : chr [1:1460] "Warranty Deed - Conventional" "Warranty Deed - Conventional" "Warranty Deed - Conventional" "Warranty Deed - Conventional" ...
## $ SaleCondition : chr [1:1460] "Normal" "Normal" "Normal" "Abnorml" ...
## $ SalePrice : num [1:1460] 208500 181500 223500 140000 250000 ...
# Per-column summary: min/quartiles/mean/max for numeric columns and
# length/class/mode for character columns.
summary(House_pricing_data)
## HouseId MSZoning LotAreaSquareFeet LandSlope
## Min. : 1.0 Length:1460 Min. : 1300 Length:1460
## 1st Qu.: 365.8 Class :character 1st Qu.: 7554 Class :character
## Median : 730.5 Mode :character Median : 9478 Mode :character
## Mean : 730.5 Mean : 10517
## 3rd Qu.:1095.2 3rd Qu.: 11602
## Max. :1460.0 Max. :215245
## BuildingType OverallCondition YearBuilt ExteriorCondition
## Length:1460 Min. :1.000 Min. :1872 Length:1460
## Class :character 1st Qu.:5.000 1st Qu.:1954 Class :character
## Mode :character Median :5.000 Median :1973 Mode :character
## Mean :5.575 Mean :1971
## 3rd Qu.:6.000 3rd Qu.:2000
## Max. :9.000 Max. :2010
## Foundation TotalBasementSquareFeet HeatingQualityCondition
## Length:1460 Min. : 0.0 Length:1460
## Class :character 1st Qu.: 795.8 Class :character
## Mode :character Median : 991.5 Mode :character
## Mean :1057.4
## 3rd Qu.:1298.2
## Max. :6110.0
## CentralAirConditioning 1stFloorSquareFeet 2ndFlrSquareFeet LivAreaSquareFeet
## Length:1460 Min. : 334 Min. : 0 Min. : 334
## Class :character 1st Qu.: 882 1st Qu.: 0 1st Qu.:1130
## Mode :character Median :1087 Median : 0 Median :1464
## Mean :1163 Mean : 347 Mean :1515
## 3rd Qu.:1391 3rd Qu.: 728 3rd Qu.:1777
## Max. :4692 Max. :2065 Max. :5642
## FullBathrooms Bedrooms KitchenQualityCondition TotalRooms
## Min. :0.000 Min. :0.000 Length:1460 Min. : 2.000
## 1st Qu.:1.000 1st Qu.:2.000 Class :character 1st Qu.: 5.000
## Median :2.000 Median :3.000 Mode :character Median : 6.000
## Mean :1.565 Mean :2.866 Mean : 6.518
## 3rd Qu.:2.000 3rd Qu.:3.000 3rd Qu.: 7.000
## Max. :3.000 Max. :8.000 Max. :14.000
## GarageArea TotalPorchAreaSquareFeet MonthSold YearSold
## Min. : 0.0 Min. : 0.00 Min. : 1.000 Min. :2006
## 1st Qu.: 334.5 1st Qu.: 0.00 1st Qu.: 5.000 1st Qu.:2007
## Median : 480.0 Median : 40.00 Median : 6.000 Median :2008
## Mean : 473.0 Mean : 68.61 Mean : 6.322 Mean :2008
## 3rd Qu.: 576.0 3rd Qu.:104.00 3rd Qu.: 8.000 3rd Qu.:2009
## Max. :1418.0 Max. :638.00 Max. :12.000 Max. :2010
## SaleType SaleCondition SalePrice
## Length:1460 Length:1460 Min. : 34900
## Class :character Class :character 1st Qu.:129975
## Mode :character Mode :character Median :163000
## Mean :180921
## 3rd Qu.:214000
## Max. :755000
# NOTE(review): skimr is attached here, mid-script, rather than alongside the
# other library() calls at the top of the file.
library(skimr) # More detailed summary using skimr
# Per-column report grouped by variable type (missing counts, unique values,
# mean/sd and quantiles).
skim(House_pricing_data)
| Name | House_pricing_data |
| Number of rows | 1460 |
| Number of columns | 26 |
| _______________________ | |
| Column type frequency: | |
| character | 10 |
| numeric | 16 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| MSZoning | 0 | 1 | 10 | 28 | 0 | 5 | 0 |
| LandSlope | 0 | 1 | 11 | 13 | 0 | 3 | 0 |
| BuildingType | 0 | 1 | 6 | 62 | 0 | 5 | 0 |
| ExteriorCondition | 0 | 1 | 4 | 15 | 0 | 5 | 0 |
| Foundation | 0 | 1 | 4 | 15 | 0 | 6 | 0 |
| HeatingQualityCondition | 0 | 1 | 4 | 15 | 0 | 5 | 0 |
| CentralAirConditioning | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
| KitchenQualityCondition | 0 | 1 | 4 | 15 | 0 | 4 | 0 |
| SaleType | 0 | 1 | 5 | 42 | 0 | 9 | 0 |
| SaleCondition | 0 | 1 | 6 | 7 | 0 | 6 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| HouseId | 0 | 1 | 730.50 | 421.61 | 1 | 365.75 | 730.5 | 1095.25 | 1460 | ▇▇▇▇▇ |
| LotAreaSquareFeet | 0 | 1 | 10516.83 | 9981.26 | 1300 | 7553.50 | 9478.5 | 11601.50 | 215245 | ▇▁▁▁▁ |
| OverallCondition | 0 | 1 | 5.58 | 1.11 | 1 | 5.00 | 5.0 | 6.00 | 9 | ▁▁▇▅▁ |
| YearBuilt | 0 | 1 | 1971.27 | 30.20 | 1872 | 1954.00 | 1973.0 | 2000.00 | 2010 | ▁▂▃▆▇ |
| TotalBasementSquareFeet | 0 | 1 | 1057.43 | 438.71 | 0 | 795.75 | 991.5 | 1298.25 | 6110 | ▇▃▁▁▁ |
| 1stFloorSquareFeet | 0 | 1 | 1162.63 | 386.59 | 334 | 882.00 | 1087.0 | 1391.25 | 4692 | ▇▅▁▁▁ |
| 2ndFlrSquareFeet | 0 | 1 | 346.99 | 436.53 | 0 | 0.00 | 0.0 | 728.00 | 2065 | ▇▃▂▁▁ |
| LivAreaSquareFeet | 0 | 1 | 1515.46 | 525.48 | 334 | 1129.50 | 1464.0 | 1776.75 | 5642 | ▇▇▁▁▁ |
| FullBathrooms | 0 | 1 | 1.57 | 0.55 | 0 | 1.00 | 2.0 | 2.00 | 3 | ▁▇▁▇▁ |
| Bedrooms | 0 | 1 | 2.87 | 0.82 | 0 | 2.00 | 3.0 | 3.00 | 8 | ▁▇▂▁▁ |
| TotalRooms | 0 | 1 | 6.52 | 1.63 | 2 | 5.00 | 6.0 | 7.00 | 14 | ▂▇▇▁▁ |
| GarageArea | 0 | 1 | 472.98 | 213.80 | 0 | 334.50 | 480.0 | 576.00 | 1418 | ▂▇▃▁▁ |
| TotalPorchAreaSquareFeet | 0 | 1 | 68.61 | 85.86 | 0 | 0.00 | 40.0 | 104.00 | 638 | ▇▂▁▁▁ |
| MonthSold | 0 | 1 | 6.32 | 2.70 | 1 | 5.00 | 6.0 | 8.00 | 12 | ▃▆▇▃▃ |
| YearSold | 0 | 1 | 2007.82 | 1.33 | 2006 | 2007.00 | 2008.0 | 2009.00 | 2010 | ▇▇▇▇▅ |
| SalePrice | 0 | 1 | 180921.20 | 79442.50 | 34900 | 129975.00 | 163000.0 | 214000.00 | 755000 | ▇▅▁▁▁ |
# Count rows that are exact duplicates of an earlier row (all columns compared).
# NOTE(review): the comparison includes HouseId; if HouseId is unique per row
# this can never report a duplicate -- consider excluding it. TODO confirm.
sum(duplicated(House_pricing_data))
## [1] 0
There are no duplicates.
# Count the NA cells across the whole data frame.
sum(is.na(House_pricing_data))
## [1] 0
There are no missing values.
# Separate the numerical columns, dropping HouseId in the same step.
# select(where(...)) replaces the superseded select_if() scoped verb; the
# remaining columns keep their original order.
numerical_cols <- House_pricing_data %>%
  select(where(is.numeric), -HouseId)
# List the numeric columns that were kept.
colnames(numerical_cols)
## [1] "LotAreaSquareFeet" "OverallCondition"
## [3] "YearBuilt" "TotalBasementSquareFeet"
## [5] "1stFloorSquareFeet" "2ndFlrSquareFeet"
## [7] "LivAreaSquareFeet" "FullBathrooms"
## [9] "Bedrooms" "TotalRooms"
## [11] "GarageArea" "TotalPorchAreaSquareFeet"
## [13] "MonthSold" "YearSold"
## [15] "SalePrice"
# Preview the first six rows of the numeric-only subset.
head(numerical_cols)
## # A tibble: 6 × 15
## LotAreaSquareFeet OverallCondition YearBuilt TotalBasementSquareFeet
## <dbl> <dbl> <dbl> <dbl>
## 1 8450 5 2003 856
## 2 9600 8 1976 1262
## 3 11250 5 2001 920
## 4 9550 5 1915 756
## 5 14260 5 2000 1145
## 6 14115 5 1993 796
## # ℹ 11 more variables: `1stFloorSquareFeet` <dbl>, `2ndFlrSquareFeet` <dbl>,
## # LivAreaSquareFeet <dbl>, FullBathrooms <dbl>, Bedrooms <dbl>,
## # TotalRooms <dbl>, GarageArea <dbl>, TotalPorchAreaSquareFeet <dbl>,
## # MonthSold <dbl>, YearSold <dbl>, SalePrice <dbl>
# Separate the categorical columns (factor or character).
# select(where(...)) replaces the superseded select_if(); `||` is used because
# each is.*() call returns a single logical for the whole column.
categorical_cols <- House_pricing_data %>%
  select(where(function(x) is.factor(x) || is.character(x)))
# List the categorical columns that were kept.
colnames(categorical_cols)
## [1] "MSZoning" "LandSlope"
## [3] "BuildingType" "ExteriorCondition"
## [5] "Foundation" "HeatingQualityCondition"
## [7] "CentralAirConditioning" "KitchenQualityCondition"
## [9] "SaleType" "SaleCondition"
# Histograms (30 bins each) of SalePrice and TotalBasementSquareFeet,
# drawn together in a single figure.
p1 <- House_pricing_data %>%
  ggplot(aes(x = SalePrice)) +
  geom_histogram(bins = 30, alpha = 0.5, fill = "blue") +
  labs(title = "SalePrice Distribution")
p2 <- House_pricing_data %>%
  ggplot(aes(x = TotalBasementSquareFeet)) +
  geom_histogram(bins = 30, alpha = 0.5, fill = "green") +
  labs(title = "TotalBasementSquareFeet Distribution")
grid.arrange(p1, p2)
SalePrice — Shape of the Distribution: The SalePrice histogram is right-skewed. Peak: There is a peak at lower sale prices, suggesting that most properties sell toward the lower end of the price range. This is typical of a market where there are many affordable properties.
Tail to the Right: There is a long tail extending toward higher sale prices. This indicates that a smaller number of properties have significantly higher sale prices.
Outliers: The tail also suggests the presence of outliers—properties with very high sale prices, which could be luxury homes or properties in very high-demand locations.
This distribution indicates a typical “few expensive properties, many affordable ones” scenario, which is often observed in housing markets.
TotalBasementSquareFeet — Shape of the Distribution: The histogram is right-skewed: most values sit at the lower end of the range and a thin tail extends toward larger basements.
Peak at the lower end: The majority of homes have small-to-moderate basements (the median is about 990 square feet), and some properties have no basement at all (0 square feet). This is seen in the concentration of values on the left side of the histogram.
Gradual decline: As the size of the basement increases, the frequency decreases, indicating that fewer homes have very large basements.
Small Tail on the Right: There are some homes with very large basements, but these properties are relatively rare. The right tail is smaller, suggesting that these larger basement sizes are outliers.
# Build a histogram of one numeric column.
#   data   - data frame containing the column
#   column - column name, given as a string
#   fill   - bar fill colour
#   bins   - number of histogram bins (default 30)
# Returns a ggplot object titled "<column> Distribution". The x-axis label is
# set explicitly so it shows the column name rather than the `.data`
# expression.
plot_numeric_histogram <- function(data, column, fill, bins = 30) {
  ggplot(data, aes(x = .data[[column]])) +
    geom_histogram(fill = fill, bins = bins, alpha = 0.5) +
    labs(x = column) +
    ggtitle(paste(column, "Distribution"))
}

# Lot area and garage area
p3 <- plot_numeric_histogram(House_pricing_data, "LotAreaSquareFeet", "red")
p4 <- plot_numeric_histogram(House_pricing_data, "GarageArea", "purple")
grid.arrange(p3, p4)

# Living area and first-floor area
p5 <- plot_numeric_histogram(House_pricing_data, "LivAreaSquareFeet", "orange")
p6 <- plot_numeric_histogram(House_pricing_data, "1stFloorSquareFeet", "yellow")
grid.arrange(p5, p6)

# Overall condition and second-floor area
p7 <- plot_numeric_histogram(House_pricing_data, "OverallCondition", "pink")
p8 <- plot_numeric_histogram(House_pricing_data, "2ndFlrSquareFeet", "brown")
grid.arrange(p7, p8)

# Porch area, total rooms and full bathrooms
p9 <- plot_numeric_histogram(House_pricing_data, "TotalPorchAreaSquareFeet", "cyan")
p10 <- plot_numeric_histogram(House_pricing_data, "TotalRooms", "green")
p11 <- plot_numeric_histogram(House_pricing_data, "FullBathrooms", "blue")
grid.arrange(p9, p10, p11)
# Create count plots for each categorical variable

# Build a bar chart of category counts for one categorical column.
#   data   - data frame containing the column
#   column - column name, given as a string
# Bars are filled by category; the legend is hidden because it would only
# repeat the x-axis labels, which are rotated 45 degrees.
# Returns a ggplot object titled "<column> Distribution".
plot_category_counts <- function(data, column) {
  ggplot(data, aes(x = .data[[column]], fill = .data[[column]])) +
    geom_bar() +
    labs(x = column) +
    ggtitle(paste(column, "Distribution")) +
    theme(
      axis.text.x = element_text(angle = 45, hjust = 1),
      legend.position = "none"
    )
}

c1 <- plot_category_counts(House_pricing_data, "MSZoning")
c2 <- plot_category_counts(House_pricing_data, "LandSlope")
grid.arrange(c1, c2)

c3 <- plot_category_counts(House_pricing_data, "BuildingType")
c3

c4 <- plot_category_counts(House_pricing_data, "ExteriorCondition")
c4

c5 <- plot_category_counts(House_pricing_data, "Foundation")
c6 <- plot_category_counts(House_pricing_data, "HeatingQualityCondition")
grid.arrange(c5, c6)

c7 <- plot_category_counts(House_pricing_data, "CentralAirConditioning")
c8 <- plot_category_counts(House_pricing_data, "KitchenQualityCondition")
grid.arrange(c7, c8)

c9 <- plot_category_counts(House_pricing_data, "SaleType")
c9

c10 <- plot_category_counts(House_pricing_data, "SaleCondition")
c10
# Calculate the pairwise correlation matrix of the numeric columns
cor_matrix <- cor(numerical_cols)
# Draw the upper triangle as a coloured heat map.
# addCoef.col prints each coefficient in its cell; without it number.cex
# has nothing to scale, because method = "color" draws no numbers by itself.
corrplot(cor_matrix,
  method = "color", type = "upper",
  tl.col = "black", tl.srt = 45,
  addCoef.col = "black", number.cex = 0.7
)
# Bucket every house into the decade in which it was built
House_pricing_data <- House_pricing_data %>%
  mutate(Decade = floor(YearBuilt / 10) * 10)
# Mean SalePrice within each build decade
average_saleprice_by_decade <- House_pricing_data %>%
  group_by(Decade) %>%
  summarise(average_saleprice = mean(SalePrice, na.rm = TRUE))
# Line-and-point chart of the per-decade averages; group = 1 joins the
# points into one line even though the x axis is a factor
average_saleprice_by_decade %>%
  ggplot(aes(x = factor(Decade), y = average_saleprice)) +
  geom_line(group = 1, color = "blue", linewidth = 1) +
  geom_point(color = "red", size = 2) +
  labs(
    title = "Average SalePrice by Decade of YearBuilt",
    x = "Decade Built",
    y = "Average Sale Price"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Compute the mean SalePrice for each distinct value of a grouping column.
#   data      - data frame with a SalePrice column
#   group_col - unquoted name of the column to group by
# Returns one row per group with the column `average_saleprice`.
average_saleprice_by <- function(data, group_col) {
  data %>%
    group_by({{ group_col }}) %>%
    summarise(average_saleprice = mean(SalePrice, na.rm = TRUE))
}

# Draw a bar chart of pre-computed average sale prices.
#   avg_data  - output of average_saleprice_by()
#   group_col - unquoted name of the grouping column (shown as a factor on x)
#   fill      - bar fill colour
#   title     - plot title
#   x_label   - x-axis label
# geom_col() is the direct equivalent of geom_bar(stat = "identity").
plot_average_saleprice <- function(avg_data, group_col, fill, title, x_label) {
  ggplot(avg_data, aes(x = factor({{ group_col }}), y = average_saleprice)) +
    geom_col(fill = fill, color = "black") +
    ggtitle(title) +
    xlab(x_label) +
    ylab("Average Sale Price") +
    theme_minimal()
}

# Average SalePrice by Number of Bedrooms
avg_saleprice_by_bedrooms <- average_saleprice_by(House_pricing_data, Bedrooms)
plot_average_saleprice(
  avg_saleprice_by_bedrooms, Bedrooms,
  fill = "lightblue",
  title = "Average SalePrice by Number of Bedrooms",
  x_label = "Number of Bedrooms"
)

# Average SalePrice by Number of FullBathrooms
avg_saleprice_by_fullbathrooms <- average_saleprice_by(House_pricing_data, FullBathrooms)
plot_average_saleprice(
  avg_saleprice_by_fullbathrooms, FullBathrooms,
  fill = "lightgreen",
  title = "Average SalePrice by Number of FullBathrooms",
  x_label = "Number of FullBathrooms"
)

# Average SalePrice by TotalRooms
avg_saleprice_by_totalrooms <- average_saleprice_by(House_pricing_data, TotalRooms)
plot_average_saleprice(
  avg_saleprice_by_totalrooms, TotalRooms,
  fill = "lightblue",
  title = "Average SalePrice by TotalRooms",
  x_label = "Number of Total Rooms"
)
We can use feature engineering to derive a total living area feature (above-ground living area plus basement area).
# Derive TotalLivingArea as living area plus basement area
House_pricing_data <- House_pricing_data %>%
  mutate(TotalLivingArea = LivAreaSquareFeet + TotalBasementSquareFeet)
# Inspect the first few values of the new column
head(House_pricing_data[["TotalLivingArea"]])
## [1] 2566 2524 2706 2473 3343 2158
# Scatter plot of SalePrice against the derived TotalLivingArea feature
House_pricing_data %>%
  ggplot(aes(x = TotalLivingArea, y = SalePrice)) +
  geom_point(alpha = 0.5, color = "blue") +
  labs(
    title = "TotalLivingArea vs SalePrice",
    x = "Total Living Area (sq ft)",
    y = "Sale Price"
  ) +
  theme_minimal()