---
title: "Untitled"
author: "Teddy Wambua 21/05600"
date: "2025-03-05"
output: html_document
---

## Load Libraries and Data

library(readxl)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(corrplot)
## corrplot 0.95 loaded
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Read the house-pricing workbook into `df`.
# NOTE(review): this is an absolute path on the author's machine; the script
# will only run elsewhere if the file is placed at the same location.
df <- readxl::read_xlsx(path = "C:/Users/USER/Desktop/House pricing data.xlsx")
## Data Overview

# Inspect the structure of the data: dimensions, column names, column types,
# and the first few values of every column.
str(df)
## tibble [1,460 × 26] (S3: tbl_df/tbl/data.frame)
##  $ HouseId                 : num [1:1460] 1 2 3 4 5 6 7 8 9 10 ...
##  $ MSZoning                : chr [1:1460] "Residential Low Density" "Residential Low Density" "Residential Low Density" "Residential Low Density" ...
##  $ LotAreaSquareFeet       : num [1:1460] 8450 9600 11250 9550 14260 ...
##  $ LandSlope               : chr [1:1460] "Gentleslope" "Gentleslope" "Gentleslope" "Gentleslope" ...
##  $ BuildingType            : chr [1:1460] "Single-family Detached" "Single-family Detached" "Single-family Detached" "Single-family Detached" ...
##  $ OverallCondition        : num [1:1460] 5 8 5 5 5 5 5 6 5 6 ...
##  $ YearBuilt               : num [1:1460] 2003 1976 2001 1915 2000 ...
##  $ ExteriorCondition       : chr [1:1460] "Average/Typical" "Average/Typical" "Average/Typical" "Average/Typical" ...
##  $ Foundation              : chr [1:1460] "Poured Contrete" "Cinder Block" "Poured Contrete" "Brick & Tile" ...
##  $ TotalBasementSquareFeet : num [1:1460] 856 1262 920 756 1145 ...
##  $ HeatingQualityCondition : chr [1:1460] "Excellent" "Excellent" "Excellent" "Good" ...
##  $ CentralAirConditioning  : chr [1:1460] "Yes" "Yes" "Yes" "Yes" ...
##  $ 1stFloorSquareFeet      : num [1:1460] 856 1262 920 961 1145 ...
##  $ 2ndFlrSquareFeet        : num [1:1460] 854 0 866 756 1053 ...
##  $ LivAreaSquareFeet       : num [1:1460] 1710 1262 1786 1717 2198 ...
##  $ FullBathrooms           : num [1:1460] 2 2 2 1 2 1 2 2 2 1 ...
##  $ Bedrooms                : num [1:1460] 3 3 3 3 4 1 3 3 2 2 ...
##  $ KitchenQualityCondition : chr [1:1460] "Good" "Typical/Average" "Good" "Good" ...
##  $ TotalRooms              : num [1:1460] 8 6 6 7 9 5 7 7 8 5 ...
##  $ GarageArea              : num [1:1460] 548 460 608 642 836 480 636 484 468 205 ...
##  $ TotalPorchAreaSquareFeet: num [1:1460] 61 0 42 307 84 30 57 432 205 4 ...
##  $ MonthSold               : num [1:1460] 2 5 9 2 12 10 8 11 4 1 ...
##  $ YearSold                : num [1:1460] 2008 2007 2008 2006 2008 ...
##  $ SaleType                : chr [1:1460] "Warranty Deed - Conventional" "Warranty Deed - Conventional" "Warranty Deed - Conventional" "Warranty Deed - Conventional" ...
##  $ SaleCondition           : chr [1:1460] "Normal" "Normal" "Normal" "Abnorml" ...
##  $ SalePrice               : num [1:1460] 208500 181500 223500 140000 250000 ...
# Per-column summary: min, quartiles, mean and max for numeric columns;
# length, class and mode for character columns.
summary(df)
##     HouseId         MSZoning         LotAreaSquareFeet  LandSlope        
##  Min.   :   1.0   Length:1460        Min.   :  1300    Length:1460       
##  1st Qu.: 365.8   Class :character   1st Qu.:  7554    Class :character  
##  Median : 730.5   Mode  :character   Median :  9478    Mode  :character  
##  Mean   : 730.5                      Mean   : 10517                      
##  3rd Qu.:1095.2                      3rd Qu.: 11602                      
##  Max.   :1460.0                      Max.   :215245                      
##  BuildingType       OverallCondition   YearBuilt    ExteriorCondition 
##  Length:1460        Min.   :1.000    Min.   :1872   Length:1460       
##  Class :character   1st Qu.:5.000    1st Qu.:1954   Class :character  
##  Mode  :character   Median :5.000    Median :1973   Mode  :character  
##                     Mean   :5.575    Mean   :1971                     
##                     3rd Qu.:6.000    3rd Qu.:2000                     
##                     Max.   :9.000    Max.   :2010                     
##   Foundation        TotalBasementSquareFeet HeatingQualityCondition
##  Length:1460        Min.   :   0.0          Length:1460            
##  Class :character   1st Qu.: 795.8          Class :character       
##  Mode  :character   Median : 991.5          Mode  :character       
##                     Mean   :1057.4                                 
##                     3rd Qu.:1298.2                                 
##                     Max.   :6110.0                                 
##  CentralAirConditioning 1stFloorSquareFeet 2ndFlrSquareFeet LivAreaSquareFeet
##  Length:1460            Min.   : 334       Min.   :   0     Min.   : 334     
##  Class :character       1st Qu.: 882       1st Qu.:   0     1st Qu.:1130     
##  Mode  :character       Median :1087       Median :   0     Median :1464     
##                         Mean   :1163       Mean   : 347     Mean   :1515     
##                         3rd Qu.:1391       3rd Qu.: 728     3rd Qu.:1777     
##                         Max.   :4692       Max.   :2065     Max.   :5642     
##  FullBathrooms      Bedrooms     KitchenQualityCondition   TotalRooms    
##  Min.   :0.000   Min.   :0.000   Length:1460             Min.   : 2.000  
##  1st Qu.:1.000   1st Qu.:2.000   Class :character        1st Qu.: 5.000  
##  Median :2.000   Median :3.000   Mode  :character        Median : 6.000  
##  Mean   :1.565   Mean   :2.866                           Mean   : 6.518  
##  3rd Qu.:2.000   3rd Qu.:3.000                           3rd Qu.: 7.000  
##  Max.   :3.000   Max.   :8.000                           Max.   :14.000  
##    GarageArea     TotalPorchAreaSquareFeet   MonthSold         YearSold   
##  Min.   :   0.0   Min.   :  0.00           Min.   : 1.000   Min.   :2006  
##  1st Qu.: 334.5   1st Qu.:  0.00           1st Qu.: 5.000   1st Qu.:2007  
##  Median : 480.0   Median : 40.00           Median : 6.000   Median :2008  
##  Mean   : 473.0   Mean   : 68.61           Mean   : 6.322   Mean   :2008  
##  3rd Qu.: 576.0   3rd Qu.:104.00           3rd Qu.: 8.000   3rd Qu.:2009  
##  Max.   :1418.0   Max.   :638.00           Max.   :12.000   Max.   :2010  
##    SaleType         SaleCondition        SalePrice     
##  Length:1460        Length:1460        Min.   : 34900  
##  Class :character   Class :character   1st Qu.:129975  
##  Mode  :character   Mode  :character   Median :163000  
##                                        Mean   :180921  
##                                        3rd Qu.:214000  
##                                        Max.   :755000
# Count the NA entries in each column (0 means the column is complete).
df %>%
  is.na() %>%
  colSums()
##                  HouseId                 MSZoning        LotAreaSquareFeet 
##                        0                        0                        0 
##                LandSlope             BuildingType         OverallCondition 
##                        0                        0                        0 
##                YearBuilt        ExteriorCondition               Foundation 
##                        0                        0                        0 
##  TotalBasementSquareFeet  HeatingQualityCondition   CentralAirConditioning 
##                        0                        0                        0 
##       1stFloorSquareFeet         2ndFlrSquareFeet        LivAreaSquareFeet 
##                        0                        0                        0 
##            FullBathrooms                 Bedrooms  KitchenQualityCondition 
##                        0                        0                        0 
##               TotalRooms               GarageArea TotalPorchAreaSquareFeet 
##                        0                        0                        0 
##                MonthSold                 YearSold                 SaleType 
##                        0                        0                        0 
##            SaleCondition                SalePrice 
##                        0                        0
## Exploratory Data Analysis

### Distribution of Sale Prices
# Histogram of sale prices using bins that are 10,000 wide.
df %>%
  ggplot(mapping = aes(SalePrice)) +
  geom_histogram(binwidth = 1e4, fill = "blue", alpha = 0.7) +
  ggtitle("Distribution of Sale Prices") +
  xlab("Sale Price") +
  ylab("Count")

### Correlation Analysis
# Pairwise Pearson correlations between the numeric variables, drawn as a
# colour-coded upper-triangle matrix.
#
# * `select(where(is.numeric))` replaces the superseded `select_if()`.
# * HouseId is a sequential row identifier (1..1460), not a measurement, so it
#   is dropped; correlating it with the other variables only adds a
#   meaningless row and column to the plot.
numeric_cols <- df %>%
  select(where(is.numeric), -HouseId)
corr_matrix <- cor(numeric_cols)
corrplot(corr_matrix, method = "color", type = "upper", tl.cex = 0.7)

### Sale Price vs Living Area

# Scatter plot of sale price against living area with a fitted straight line.
ggplot(df, aes(x = LivAreaSquareFeet, y = SalePrice)) +
  geom_point(alpha = 0.5) +
  # The formula is stated explicitly so the model being fitted is visible in
  # the code and ggplot2 no longer prints its "using formula = 'y ~ x'" note.
  geom_smooth(method = "lm", formula = y ~ x, colour = "red") +
  labs(title = "Sale Price vs Living Area", x = "Living Area (sq ft)", y = "Sale Price")

### Sale Price by Building Type

# Box plots of sale price for each building type.
ggplot(df, aes(x = BuildingType, y = SalePrice, fill = BuildingType)) +
  # The fill colour repeats what the x axis already shows, so the legend is
  # suppressed to leave more room for the panel.
  geom_boxplot(show.legend = FALSE) +
  # Rotate the category labels so the long building-type names do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Sale Price by Building Type", x = "Building Type", y = "Sale Price")

## Interpretation of Findings

1. Sale Price Distribution

2. Relationship Between Living Area and Sale Price

3. Variation in Sale Prices by Building Type

4. Correlation Analysis and Key Influencing Variables