GROUP MEMBERS

1. Mercy Cheptoo 21/05823

2. Edwin Mucheru 21/06264

3. Celine Salesa 21/08374

4. JoyComfort Wangari 21/05738

library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(corrplot)
## corrplot 0.95 loaded

IMPORTING THE DATASET

# Load the house-pricing workbook into a tibble.
# NOTE(review): this absolute path points into one user's Downloads folder;
# point `data_path` at a project-relative copy before sharing the script.
data_path <- "/Users/cheptoo/Downloads/House pricing data.xlsx"
House_pricing_data <- read_excel(data_path)

# View() opens an interactive data viewer, so only call it in a live session;
# it is not useful (and may fail) when the document is knitted or run in batch.
if (interactive()) {
  View(House_pricing_data)
}

checking the columns

# List the column names of the imported data
names(House_pricing_data)
##  [1] "HouseId"                  "MSZoning"                
##  [3] "LotAreaSquareFeet"        "LandSlope"               
##  [5] "BuildingType"             "OverallCondition"        
##  [7] "YearBuilt"                "ExteriorCondition"       
##  [9] "Foundation"               "TotalBasementSquareFeet" 
## [11] "HeatingQualityCondition"  "CentralAirConditioning"  
## [13] "1stFloorSquareFeet"       "2ndFlrSquareFeet"        
## [15] "LivAreaSquareFeet"        "FullBathrooms"           
## [17] "Bedrooms"                 "KitchenQualityCondition" 
## [19] "TotalRooms"               "GarageArea"              
## [21] "TotalPorchAreaSquareFeet" "MonthSold"               
## [23] "YearSold"                 "SaleType"                
## [25] "SaleCondition"            "SalePrice"

EDA

To understand the structure and distribution of the data, and to identify trends and relationships between the variables

# Peek at the first six rows
head(House_pricing_data, n = 6L)
## # A tibble: 6 × 26
##   HouseId MSZoning     LotAreaSquareFeet LandSlope BuildingType OverallCondition
##     <dbl> <chr>                    <dbl> <chr>     <chr>                   <dbl>
## 1       1 Residential…              8450 Gentlesl… Single-fami…                5
## 2       2 Residential…              9600 Gentlesl… Single-fami…                8
## 3       3 Residential…             11250 Gentlesl… Single-fami…                5
## 4       4 Residential…              9550 Gentlesl… Single-fami…                5
## 5       5 Residential…             14260 Gentlesl… Single-fami…                5
## 6       6 Residential…             14115 Gentlesl… Single-fami…                5
## # ℹ 20 more variables: YearBuilt <dbl>, ExteriorCondition <chr>,
## #   Foundation <chr>, TotalBasementSquareFeet <dbl>,
## #   HeatingQualityCondition <chr>, CentralAirConditioning <chr>,
## #   `1stFloorSquareFeet` <dbl>, `2ndFlrSquareFeet` <dbl>,
## #   LivAreaSquareFeet <dbl>, FullBathrooms <dbl>, Bedrooms <dbl>,
## #   KitchenQualityCondition <chr>, TotalRooms <dbl>, GarageArea <dbl>,
## #   TotalPorchAreaSquareFeet <dbl>, MonthSold <dbl>, YearSold <dbl>, …
# Peek at the last six rows
tail(House_pricing_data, n = 6L)
## # A tibble: 6 × 26
##   HouseId MSZoning     LotAreaSquareFeet LandSlope BuildingType OverallCondition
##     <dbl> <chr>                    <dbl> <chr>     <chr>                   <dbl>
## 1    1455 Floating Vi…              7500 Gentlesl… Single-fami…                5
## 2    1456 Residential…              7917 Gentlesl… Single-fami…                5
## 3    1457 Residential…             13175 Gentlesl… Single-fami…                6
## 4    1458 Residential…              9042 Gentlesl… Single-fami…                9
## 5    1459 Residential…              9717 Gentlesl… Single-fami…                6
## 6    1460 Residential…              9937 Gentlesl… Single-fami…                6
## # ℹ 20 more variables: YearBuilt <dbl>, ExteriorCondition <chr>,
## #   Foundation <chr>, TotalBasementSquareFeet <dbl>,
## #   HeatingQualityCondition <chr>, CentralAirConditioning <chr>,
## #   `1stFloorSquareFeet` <dbl>, `2ndFlrSquareFeet` <dbl>,
## #   LivAreaSquareFeet <dbl>, FullBathrooms <dbl>, Bedrooms <dbl>,
## #   KitchenQualityCondition <chr>, TotalRooms <dbl>, GarageArea <dbl>,
## #   TotalPorchAreaSquareFeet <dbl>, MonthSold <dbl>, YearSold <dbl>, …
# Transposed overview: one line per column with its type and first values
House_pricing_data %>%
  glimpse()
## Rows: 1,460
## Columns: 26
## $ HouseId                  <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14…
## $ MSZoning                 <chr> "Residential Low Density", "Residential Low D…
## $ LotAreaSquareFeet        <dbl> 8450, 9600, 11250, 9550, 14260, 14115, 10084,…
## $ LandSlope                <chr> "Gentleslope", "Gentleslope", "Gentleslope", …
## $ BuildingType             <chr> "Single-family Detached", "Single-family Deta…
## $ OverallCondition         <dbl> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 5, 5, …
## $ YearBuilt                <dbl> 2003, 1976, 2001, 1915, 2000, 1993, 2004, 197…
## $ ExteriorCondition        <chr> "Average/Typical", "Average/Typical", "Averag…
## $ Foundation               <chr> "Poured Contrete", "Cinder Block", "Poured Co…
## $ TotalBasementSquareFeet  <dbl> 856, 1262, 920, 756, 1145, 796, 1686, 1107, 9…
## $ HeatingQualityCondition  <chr> "Excellent", "Excellent", "Excellent", "Good"…
## $ CentralAirConditioning   <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Ye…
## $ `1stFloorSquareFeet`     <dbl> 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1…
## $ `2ndFlrSquareFeet`       <dbl> 854, 0, 866, 756, 1053, 566, 0, 983, 752, 0, …
## $ LivAreaSquareFeet        <dbl> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 209…
## $ FullBathrooms            <dbl> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, …
## $ Bedrooms                 <dbl> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2, 3, 4, 2, 3, 2, …
## $ KitchenQualityCondition  <chr> "Good", "Typical/Average", "Good", "Good", "G…
## $ TotalRooms               <dbl> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5, 5, 11, 4, 7, 5,…
## $ GarageArea               <dbl> 548, 460, 608, 642, 836, 480, 636, 484, 468, …
## $ TotalPorchAreaSquareFeet <dbl> 61, 0, 42, 307, 84, 30, 57, 432, 205, 4, 0, 2…
## $ MonthSold                <dbl> 2, 5, 9, 2, 12, 10, 8, 11, 4, 1, 2, 7, 9, 8, …
## $ YearSold                 <dbl> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 200…
## $ SaleType                 <chr> "Warranty Deed - Conventional", "Warranty Dee…
## $ SaleCondition            <chr> "Normal", "Normal", "Normal", "Abnorml", "Nor…
## $ SalePrice                <dbl> 208500, 181500, 223500, 140000, 250000, 14300…
# Base-R view of the object's class and the storage type of every column
House_pricing_data %>%
  str()
## tibble [1,460 × 26] (S3: tbl_df/tbl/data.frame)
##  $ HouseId                 : num [1:1460] 1 2 3 4 5 6 7 8 9 10 ...
##  $ MSZoning                : chr [1:1460] "Residential Low Density" "Residential Low Density" "Residential Low Density" "Residential Low Density" ...
##  $ LotAreaSquareFeet       : num [1:1460] 8450 9600 11250 9550 14260 ...
##  $ LandSlope               : chr [1:1460] "Gentleslope" "Gentleslope" "Gentleslope" "Gentleslope" ...
##  $ BuildingType            : chr [1:1460] "Single-family Detached" "Single-family Detached" "Single-family Detached" "Single-family Detached" ...
##  $ OverallCondition        : num [1:1460] 5 8 5 5 5 5 5 6 5 6 ...
##  $ YearBuilt               : num [1:1460] 2003 1976 2001 1915 2000 ...
##  $ ExteriorCondition       : chr [1:1460] "Average/Typical" "Average/Typical" "Average/Typical" "Average/Typical" ...
##  $ Foundation              : chr [1:1460] "Poured Contrete" "Cinder Block" "Poured Contrete" "Brick & Tile" ...
##  $ TotalBasementSquareFeet : num [1:1460] 856 1262 920 756 1145 ...
##  $ HeatingQualityCondition : chr [1:1460] "Excellent" "Excellent" "Excellent" "Good" ...
##  $ CentralAirConditioning  : chr [1:1460] "Yes" "Yes" "Yes" "Yes" ...
##  $ 1stFloorSquareFeet      : num [1:1460] 856 1262 920 961 1145 ...
##  $ 2ndFlrSquareFeet        : num [1:1460] 854 0 866 756 1053 ...
##  $ LivAreaSquareFeet       : num [1:1460] 1710 1262 1786 1717 2198 ...
##  $ FullBathrooms           : num [1:1460] 2 2 2 1 2 1 2 2 2 1 ...
##  $ Bedrooms                : num [1:1460] 3 3 3 3 4 1 3 3 2 2 ...
##  $ KitchenQualityCondition : chr [1:1460] "Good" "Typical/Average" "Good" "Good" ...
##  $ TotalRooms              : num [1:1460] 8 6 6 7 9 5 7 7 8 5 ...
##  $ GarageArea              : num [1:1460] 548 460 608 642 836 480 636 484 468 205 ...
##  $ TotalPorchAreaSquareFeet: num [1:1460] 61 0 42 307 84 30 57 432 205 4 ...
##  $ MonthSold               : num [1:1460] 2 5 9 2 12 10 8 11 4 1 ...
##  $ YearSold                : num [1:1460] 2008 2007 2008 2006 2008 ...
##  $ SaleType                : chr [1:1460] "Warranty Deed - Conventional" "Warranty Deed - Conventional" "Warranty Deed - Conventional" "Warranty Deed - Conventional" ...
##  $ SaleCondition           : chr [1:1460] "Normal" "Normal" "Normal" "Abnorml" ...
##  $ SalePrice               : num [1:1460] 208500 181500 223500 140000 250000 ...
# Summary per column: quartiles and mean for numeric columns,
# length/class/mode for character columns
House_pricing_data %>%
  summary()
##     HouseId         MSZoning         LotAreaSquareFeet  LandSlope        
##  Min.   :   1.0   Length:1460        Min.   :  1300    Length:1460       
##  1st Qu.: 365.8   Class :character   1st Qu.:  7554    Class :character  
##  Median : 730.5   Mode  :character   Median :  9478    Mode  :character  
##  Mean   : 730.5                      Mean   : 10517                      
##  3rd Qu.:1095.2                      3rd Qu.: 11602                      
##  Max.   :1460.0                      Max.   :215245                      
##  BuildingType       OverallCondition   YearBuilt    ExteriorCondition 
##  Length:1460        Min.   :1.000    Min.   :1872   Length:1460       
##  Class :character   1st Qu.:5.000    1st Qu.:1954   Class :character  
##  Mode  :character   Median :5.000    Median :1973   Mode  :character  
##                     Mean   :5.575    Mean   :1971                     
##                     3rd Qu.:6.000    3rd Qu.:2000                     
##                     Max.   :9.000    Max.   :2010                     
##   Foundation        TotalBasementSquareFeet HeatingQualityCondition
##  Length:1460        Min.   :   0.0          Length:1460            
##  Class :character   1st Qu.: 795.8          Class :character       
##  Mode  :character   Median : 991.5          Mode  :character       
##                     Mean   :1057.4                                 
##                     3rd Qu.:1298.2                                 
##                     Max.   :6110.0                                 
##  CentralAirConditioning 1stFloorSquareFeet 2ndFlrSquareFeet LivAreaSquareFeet
##  Length:1460            Min.   : 334       Min.   :   0     Min.   : 334     
##  Class :character       1st Qu.: 882       1st Qu.:   0     1st Qu.:1130     
##  Mode  :character       Median :1087       Median :   0     Median :1464     
##                         Mean   :1163       Mean   : 347     Mean   :1515     
##                         3rd Qu.:1391       3rd Qu.: 728     3rd Qu.:1777     
##                         Max.   :4692       Max.   :2065     Max.   :5642     
##  FullBathrooms      Bedrooms     KitchenQualityCondition   TotalRooms    
##  Min.   :0.000   Min.   :0.000   Length:1460             Min.   : 2.000  
##  1st Qu.:1.000   1st Qu.:2.000   Class :character        1st Qu.: 5.000  
##  Median :2.000   Median :3.000   Mode  :character        Median : 6.000  
##  Mean   :1.565   Mean   :2.866                           Mean   : 6.518  
##  3rd Qu.:2.000   3rd Qu.:3.000                           3rd Qu.: 7.000  
##  Max.   :3.000   Max.   :8.000                           Max.   :14.000  
##    GarageArea     TotalPorchAreaSquareFeet   MonthSold         YearSold   
##  Min.   :   0.0   Min.   :  0.00           Min.   : 1.000   Min.   :2006  
##  1st Qu.: 334.5   1st Qu.:  0.00           1st Qu.: 5.000   1st Qu.:2007  
##  Median : 480.0   Median : 40.00           Median : 6.000   Median :2008  
##  Mean   : 473.0   Mean   : 68.61           Mean   : 6.322   Mean   :2008  
##  3rd Qu.: 576.0   3rd Qu.:104.00           3rd Qu.: 8.000   3rd Qu.:2009  
##  Max.   :1418.0   Max.   :638.00           Max.   :12.000   Max.   :2010  
##    SaleType         SaleCondition        SalePrice     
##  Length:1460        Length:1460        Min.   : 34900  
##  Class :character   Class :character   1st Qu.:129975  
##  Mode  :character   Mode  :character   Median :163000  
##                                        Mean   :180921  
##                                        3rd Qu.:214000  
##                                        Max.   :755000
# Per-column summary from skimr: missing counts, completeness, distinct
# values for character columns, and mean/sd/quantiles plus an inline
# histogram for numeric columns (see the output that follows).
library(skimr) # More detailed summary using skimr
# Called directly on the data frame; the printed summary header shows the
# object name House_pricing_data.
skim(House_pricing_data)
Data summary
Name House_pricing_data
Number of rows 1460
Number of columns 26
_______________________
Column type frequency:
character 10
numeric 16
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
MSZoning 0 1 10 28 0 5 0
LandSlope 0 1 11 13 0 3 0
BuildingType 0 1 6 62 0 5 0
ExteriorCondition 0 1 4 15 0 5 0
Foundation 0 1 4 15 0 6 0
HeatingQualityCondition 0 1 4 15 0 5 0
CentralAirConditioning 0 1 2 3 0 2 0
KitchenQualityCondition 0 1 4 15 0 4 0
SaleType 0 1 5 42 0 9 0
SaleCondition 0 1 6 7 0 6 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
HouseId 0 1 730.50 421.61 1 365.75 730.5 1095.25 1460 ▇▇▇▇▇
LotAreaSquareFeet 0 1 10516.83 9981.26 1300 7553.50 9478.5 11601.50 215245 ▇▁▁▁▁
OverallCondition 0 1 5.58 1.11 1 5.00 5.0 6.00 9 ▁▁▇▅▁
YearBuilt 0 1 1971.27 30.20 1872 1954.00 1973.0 2000.00 2010 ▁▂▃▆▇
TotalBasementSquareFeet 0 1 1057.43 438.71 0 795.75 991.5 1298.25 6110 ▇▃▁▁▁
1stFloorSquareFeet 0 1 1162.63 386.59 334 882.00 1087.0 1391.25 4692 ▇▅▁▁▁
2ndFlrSquareFeet 0 1 346.99 436.53 0 0.00 0.0 728.00 2065 ▇▃▂▁▁
LivAreaSquareFeet 0 1 1515.46 525.48 334 1129.50 1464.0 1776.75 5642 ▇▇▁▁▁
FullBathrooms 0 1 1.57 0.55 0 1.00 2.0 2.00 3 ▁▇▁▇▁
Bedrooms 0 1 2.87 0.82 0 2.00 3.0 3.00 8 ▁▇▂▁▁
TotalRooms 0 1 6.52 1.63 2 5.00 6.0 7.00 14 ▂▇▇▁▁
GarageArea 0 1 472.98 213.80 0 334.50 480.0 576.00 1418 ▂▇▃▁▁
TotalPorchAreaSquareFeet 0 1 68.61 85.86 0 0.00 40.0 104.00 638 ▇▂▁▁▁
MonthSold 0 1 6.32 2.70 1 5.00 6.0 8.00 12 ▃▆▇▃▃
YearSold 0 1 2007.82 1.33 2006 2007.00 2008.0 2009.00 2010 ▇▇▇▇▅
SalePrice 0 1 180921.20 79442.50 34900 129975.00 163000.0 214000.00 755000 ▇▅▁▁▁

CHECKING FOR DUPLICATES

# Count repeated houses. HouseId is a unique row key (1..1460), so comparing
# whole rows could never flag a duplicate; compare on every other column.
sum(duplicated(select(House_pricing_data, -HouseId)))
## [1] 0

There are no duplicates.

CHECKING FOR MISSING VALUES

# Total number of missing cells: count the NAs per column, then add them up
sum(colSums(is.na(House_pricing_data)))
## [1] 0

There are no missing values

VISUALIZATION

# Keep every numeric column except HouseId, which is only a row identifier
# and carries no information about the house itself.
# select(where(...)) replaces the superseded select_if().
numerical_cols <- House_pricing_data %>%
  select(where(is.numeric), -HouseId)

colnames(numerical_cols)
##  [1] "LotAreaSquareFeet"        "OverallCondition"        
##  [3] "YearBuilt"                "TotalBasementSquareFeet" 
##  [5] "1stFloorSquareFeet"       "2ndFlrSquareFeet"        
##  [7] "LivAreaSquareFeet"        "FullBathrooms"           
##  [9] "Bedrooms"                 "TotalRooms"              
## [11] "GarageArea"               "TotalPorchAreaSquareFeet"
## [13] "MonthSold"                "YearSold"                
## [15] "SalePrice"
# First six rows of the numeric-only subset
head(numerical_cols, n = 6L)
## # A tibble: 6 × 15
##   LotAreaSquareFeet OverallCondition YearBuilt TotalBasementSquareFeet
##               <dbl>            <dbl>     <dbl>                   <dbl>
## 1              8450                5      2003                     856
## 2              9600                8      1976                    1262
## 3             11250                5      2001                     920
## 4              9550                5      1915                     756
## 5             14260                5      2000                    1145
## 6             14115                5      1993                     796
## # ℹ 11 more variables: `1stFloorSquareFeet` <dbl>, `2ndFlrSquareFeet` <dbl>,
## #   LivAreaSquareFeet <dbl>, FullBathrooms <dbl>, Bedrooms <dbl>,
## #   TotalRooms <dbl>, GarageArea <dbl>, TotalPorchAreaSquareFeet <dbl>,
## #   MonthSold <dbl>, YearSold <dbl>, SalePrice <dbl>
# Keep the categorical columns, i.e. those stored as factor or character.
# select(where(...)) replaces the superseded select_if(); the predicate is
# evaluated once per column and returns a single TRUE/FALSE, hence `||`.
categorical_cols <- House_pricing_data %>%
  select(where(function(col) is.factor(col) || is.character(col)))

colnames(categorical_cols)
##  [1] "MSZoning"                "LandSlope"              
##  [3] "BuildingType"            "ExteriorCondition"      
##  [5] "Foundation"              "HeatingQualityCondition"
##  [7] "CentralAirConditioning"  "KitchenQualityCondition"
##  [9] "SaleType"                "SaleCondition"

Visualize the Distribution of Numerical Variables

SalePrice Distribution and TotalBasementSquareFeet Distribution

# Histograms (30 bins each) of the sale price and the total basement area
p1 <- House_pricing_data %>%
  ggplot(aes(x = SalePrice)) +
  geom_histogram(bins = 30, fill = "blue", alpha = 0.5) +
  labs(title = "SalePrice Distribution")

p2 <- House_pricing_data %>%
  ggplot(aes(x = TotalBasementSquareFeet)) +
  geom_histogram(bins = 30, fill = "green", alpha = 0.5) +
  labs(title = "TotalBasementSquareFeet Distribution")

# Stack the two panels vertically in a single column
grid.arrange(p1, p2, ncol = 1)

Interpretation

1. SalePrice Distribution

2. TotalBasementSquareFeet Distribution

LotAreaSquareFeet Distribution and GarageArea Distribution

# Histograms (30 bins each) of the lot area and the garage area
p3 <- House_pricing_data %>%
  ggplot(aes(x = LotAreaSquareFeet)) +
  geom_histogram(bins = 30, fill = "red", alpha = 0.5) +
  labs(title = "LotAreaSquareFeet Distribution")

p4 <- House_pricing_data %>%
  ggplot(aes(x = GarageArea)) +
  geom_histogram(bins = 30, fill = "purple", alpha = 0.5) +
  labs(title = "GarageArea Distribution")

# Stack the two panels vertically in a single column
grid.arrange(p3, p4, ncol = 1)

Interpretation

1. LotAreaSquareFeet Distribution

2. GarageArea Distribution

LivAreaSquareFeet Distribution and 1stFloorSquareFeet Distribution

# Histograms (30 bins each) of the living area and the first-floor area
p5 <- House_pricing_data %>%
  ggplot(aes(x = LivAreaSquareFeet)) +
  geom_histogram(bins = 30, fill = "orange", alpha = 0.5) +
  labs(title = "LivAreaSquareFeet Distribution")

# Backticks are required: the column name starts with a digit
p6 <- House_pricing_data %>%
  ggplot(aes(x = `1stFloorSquareFeet`)) +
  geom_histogram(bins = 30, fill = "yellow", alpha = 0.5) +
  labs(title = "1stFloorSquareFeet Distribution")

# Stack the two panels vertically in a single column
grid.arrange(p5, p6, ncol = 1)

Interpretation

1. LivAreaSquareFeet Distribution

2. 1stFloorSquareFeet Distribution

OverallCondition Distribution and 2ndFlrSquareFeet Distribution

# OverallCondition is an integer rating (1-9 in this data), so use one bin
# per rating value; 30 bins over such a short range leaves empty bins
# between the bars. Axis breaks are placed on the ratings that occur.
p7 <- ggplot(House_pricing_data, aes(x = OverallCondition)) +
  geom_histogram(fill = "pink", binwidth = 1, alpha = 0.5) +
  scale_x_continuous(
    breaks = sort(unique(House_pricing_data$OverallCondition))
  ) +
  ggtitle("OverallCondition Distribution")

# Second-floor area is continuous, so a 30-bin histogram is appropriate.
# Backticks are required: the column name starts with a digit.
p8 <- ggplot(House_pricing_data, aes(x = `2ndFlrSquareFeet`)) +
  geom_histogram(fill = "brown", bins = 30, alpha = 0.5) +
  ggtitle("2ndFlrSquareFeet Distribution")

# Stack the two panels vertically
grid.arrange(p7, p8)

Interpretation

1. OverallCondition Distribution

2. 2ndFlrSquareFeet Distribution

TotalPorchAreaSquareFeet Distribution and TotalRooms Distribution and FullBathrooms Distribution

# Porch area is continuous, so a 30-bin histogram is appropriate.
p9 <- ggplot(House_pricing_data, aes(x = TotalPorchAreaSquareFeet)) +
  geom_histogram(fill = "cyan", bins = 30, alpha = 0.5) +
  ggtitle("TotalPorchAreaSquareFeet Distribution")

# TotalRooms and FullBathrooms are small integer counts (2-14 and 0-3 in this
# data). One bin per count avoids the empty bins that 30 bins would leave
# between the bars.
p10 <- ggplot(House_pricing_data, aes(x = TotalRooms)) +
  geom_histogram(fill = "green", binwidth = 1, alpha = 0.5) +
  scale_x_continuous(breaks = sort(unique(House_pricing_data$TotalRooms))) +
  ggtitle("TotalRooms Distribution")

p11 <- ggplot(House_pricing_data, aes(x = FullBathrooms)) +
  geom_histogram(fill = "blue", binwidth = 1, alpha = 0.5) +
  scale_x_continuous(
    breaks = sort(unique(House_pricing_data$FullBathrooms))
  ) +
  ggtitle("FullBathrooms Distribution")

# Stack the three panels vertically
grid.arrange(p9, p10, p11)

Interpretation

1. TotalPorchAreaSquareFeet Distribution

2. TotalRooms Distribution

3. FullBathrooms Distribution

COUNT PLOT FOR CATEGORICAL VARIABLES

MSZoning Distribution and LandSlope Distribution

# Bar charts of the number of houses per zoning class and per land slope.
# Bars are coloured by category, so the legend would only repeat the x axis
# and is hidden; the labels are tilted 45 degrees.
c1 <- House_pricing_data %>%
  ggplot(aes(x = MSZoning, fill = MSZoning)) +
  geom_bar() +
  labs(title = "MSZoning Distribution") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )

c2 <- House_pricing_data %>%
  ggplot(aes(x = LandSlope, fill = LandSlope)) +
  geom_bar() +
  labs(title = "LandSlope Distribution") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )

# Stack the two panels vertically in a single column
grid.arrange(c1, c2, ncol = 1)

INTERPRETATION

1. MSZoning Distribution

2. LandSlope Distribution

BuildingType Distribution

# Number of houses per building type; legend hidden, labels tilted 45 degrees
c3 <- House_pricing_data %>%
  ggplot(aes(x = BuildingType, fill = BuildingType)) +
  geom_bar() +
  labs(title = "BuildingType Distribution") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )
c3

INTERPRETATION

1. BuildingType Distribution

ExteriorCondition Distribution

# Number of houses per exterior-condition grade; legend hidden, labels tilted
c4 <- House_pricing_data %>%
  ggplot(aes(x = ExteriorCondition, fill = ExteriorCondition)) +
  geom_bar() +
  labs(title = "ExteriorCondition Distribution") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )

c4

INTERPRETATION

1. ExteriorCondition Distribution

Foundation Distribution and HeatingQualityCondition Distribution

# Number of houses per foundation type and per heating-quality grade;
# legends hidden, labels tilted 45 degrees
c5 <- House_pricing_data %>%
  ggplot(aes(x = Foundation, fill = Foundation)) +
  geom_bar() +
  labs(title = "Foundation Distribution") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )

c6 <- House_pricing_data %>%
  ggplot(aes(x = HeatingQualityCondition, fill = HeatingQualityCondition)) +
  geom_bar() +
  labs(title = "HeatingQualityCondition Distribution") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )

# Stack the two panels vertically in a single column
grid.arrange(c5, c6, ncol = 1)

INTERPRETATION

1. Foundation Distribution

2. HeatingQualityCondition Distribution

CentralAirConditioning Distribution and KitchenQualityCondition Distribution

# Number of houses with/without central air conditioning and per
# kitchen-quality grade; legends hidden, labels tilted 45 degrees
c7 <- House_pricing_data %>%
  ggplot(aes(x = CentralAirConditioning, fill = CentralAirConditioning)) +
  geom_bar() +
  labs(title = "CentralAirConditioning Distribution") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )

c8 <- House_pricing_data %>%
  ggplot(aes(x = KitchenQualityCondition, fill = KitchenQualityCondition)) +
  geom_bar() +
  labs(title = "KitchenQualityCondition Distribution") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )

# Stack the two panels vertically in a single column
grid.arrange(c7, c8, ncol = 1)

INTERPRETATION

1. CentralAirConditioning Distribution

2. KitchenQualityCondition Distribution

SaleType Distribution

# Number of houses per sale type; legend hidden, labels tilted 45 degrees
c9 <- House_pricing_data %>%
  ggplot(aes(x = SaleType, fill = SaleType)) +
  geom_bar() +
  labs(title = "SaleType Distribution") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )
c9

INTERPRETATION

1. SaleType Distribution

SaleCondition Distribution

# Number of houses per sale condition; legend hidden, labels tilted 45 degrees
c10 <- House_pricing_data %>%
  ggplot(aes(x = SaleCondition, fill = SaleCondition)) +
  geom_bar() +
  labs(title = "SaleCondition Distribution") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )

c10

INTERPRETATION

1. SaleCondition Distribution

CORRELATION HEATMAP

# Pairwise correlations between the numeric columns (cor() defaults to Pearson)
cor_matrix <- cor(numerical_cols)

# Colour-coded upper triangle with black labels rotated 45 degrees.
# addCoef.col prints each coefficient inside its cell; without it corrplot
# draws no numbers and the number.cex setting below has no effect.
corrplot(cor_matrix, method = "color", type = "upper",
         addCoef.col = "black",
         tl.col = "black", tl.srt = 45,
         number.cex = 0.7)

Interpretation

Average SalePrice by Decade built

# Bucket every house into the decade it was built in (e.g. 1976 -> 1970)
House_pricing_data <- House_pricing_data %>%
  mutate(Decade = floor(YearBuilt / 10) * 10)

# Mean sale price within each construction decade
average_saleprice_by_decade <- House_pricing_data %>%
  group_by(Decade) %>%
  summarise(average_saleprice = mean(SalePrice, na.rm = TRUE))

# Line chart of the decade averages. Decade is shown as a discrete axis, so
# the line layer needs group = 1 to connect the points into a single path.
average_saleprice_by_decade %>%
  ggplot(aes(x = factor(Decade), y = average_saleprice)) +
  geom_line(group = 1, color = "blue", linewidth = 1) +
  geom_point(color = "red", size = 2) +
  labs(
    title = "Average SalePrice by Decade of YearBuilt",
    x = "Decade Built",
    y = "Average Sale Price"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Interpretation

Initial Increase (1870–1890):

Sharp Decline (1890–1900):

Gradual Increase (1900–1930):

Stable and Low (1930–1970):

Rapid Increase (1980–2010):

Recent Outlier (2010):

Average SalePrice by Number of Bedrooms

# Mean sale price for each bedroom count
avg_saleprice_by_bedrooms <- House_pricing_data %>%
  group_by(Bedrooms) %>%
  summarise(average_saleprice = mean(SalePrice, na.rm = TRUE))

# One bar per bedroom count; the bar height is the precomputed mean, so
# geom_col() is used (equivalent to geom_bar(stat = "identity"))
avg_saleprice_by_bedrooms %>%
  ggplot(aes(x = factor(Bedrooms), y = average_saleprice)) +
  geom_col(fill = "lightblue", color = "black") +
  labs(
    title = "Average SalePrice by Number of Bedrooms",
    x = "Number of Bedrooms",
    y = "Average Sale Price"
  ) +
  theme_minimal()

INTERPRETATION

Higher Sale Price for 0 Bedrooms (200,000+):

Relatively Higher Sale Prices for 4 and 3 Bedrooms:

Moderate Sale Prices for 2 and 5 Bedrooms:

Lower Sale Price for 6 and 8 Bedrooms:

Average SalePrice by Number of FullBathrooms

# Mean sale price for each full-bathroom count
avg_saleprice_by_fullbathrooms <- House_pricing_data %>%
  group_by(FullBathrooms) %>%
  summarise(average_saleprice = mean(SalePrice, na.rm = TRUE))

# One bar per bathroom count; the bar height is the precomputed mean, so
# geom_col() is used (equivalent to geom_bar(stat = "identity"))
avg_saleprice_by_fullbathrooms %>%
  ggplot(aes(x = factor(FullBathrooms), y = average_saleprice)) +
  geom_col(fill = "lightgreen", color = "black") +
  labs(
    title = "Average SalePrice by Number of FullBathrooms",
    x = "Number of FullBathrooms",
    y = "Average Sale Price"
  ) +
  theme_minimal()

INTERPRETATION

0 Full Bathrooms:

1 Full Bathroom:

2 Full Bathrooms:

3 Full Bathrooms:

General Conclusion:

# Mean sale price for each total-room count
avg_saleprice_by_totalrooms <- House_pricing_data %>%
  group_by(TotalRooms) %>%
  summarise(average_saleprice = mean(SalePrice, na.rm = TRUE))

# One bar per room count; the bar height is the precomputed mean, so
# geom_col() is used (equivalent to geom_bar(stat = "identity"))
avg_saleprice_by_totalrooms %>%
  ggplot(aes(x = factor(TotalRooms), y = average_saleprice)) +
  geom_col(fill = "lightblue", color = "black") +
  labs(
    title = "Average SalePrice by TotalRooms",
    x = "Number of Total Rooms",
    y = "Average Sale Price"
  ) +
  theme_minimal()

INTERPRETATION

Increase in Sale Price with More Rooms:

Moderate Increase:

Drop in Price at 12 and 14 Rooms:

FEATURE ENGINEERING

We can do feature engineering to get total living area

# New feature: living area plus basement area, stored as TotalLivingArea
House_pricing_data <- House_pricing_data %>%
  mutate(TotalLivingArea = LivAreaSquareFeet + TotalBasementSquareFeet)

# Sanity check: print the first six values of the new column
head(House_pricing_data[["TotalLivingArea"]])
## [1] 2566 2524 2706 2473 3343 2158
# Scatter plot of sale price against the engineered TotalLivingArea feature;
# points are semi-transparent so dense regions remain readable
House_pricing_data %>%
  ggplot(aes(x = TotalLivingArea, y = SalePrice)) +
  geom_point(alpha = 0.5, color = "blue") +
  labs(
    title = "TotalLivingArea vs SalePrice",
    x = "Total Living Area (sq ft)",
    y = "Sale Price"
  ) +
  theme_minimal()

INTERPRETATION

Positive Correlation:

Linear Trend:

Outliers:

Data Concentration: