EXPLORATORY DATA ANALYSIS

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(readxl)
library(ggplot2)
library(corrplot)
## corrplot 0.95 loaded

Exporting the data

house_pricing<-read_xlsx("C:/Users/CiCi/Downloads/Datasets Statistical programming/House pricing data.xlsx")

#checking for missing values ##dataframe

colSums(is.na(house_pricing))
##                  HouseId                 MSZoning        LotAreaSquareFeet 
##                        0                        0                        0 
##                LandSlope             BuildingType         OverallCondition 
##                        0                        0                        0 
##                YearBuilt        ExteriorCondition               Foundation 
##                        0                        0                        0 
##  TotalBasementSquareFeet  HeatingQualityCondition   CentralAirConditioning 
##                        0                        0                        0 
##       1stFloorSquareFeet         2ndFlrSquareFeet        LivAreaSquareFeet 
##                        0                        0                        0 
##            FullBathrooms                 Bedrooms  KitchenQualityCondition 
##                        0                        0                        0 
##               TotalRooms               GarageArea TotalPorchAreaSquareFeet 
##                        0                        0                        0 
##                MonthSold                 YearSold                 SaleType 
##                        0                        0                        0 
##            SaleCondition                SalePrice 
##                        0                        0

##there are no mising values #view of the data

glimpse(house_pricing)
## Rows: 1,460
## Columns: 26
## $ HouseId                  <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14…
## $ MSZoning                 <chr> "Residential Low Density", "Residential Low D…
## $ LotAreaSquareFeet        <dbl> 8450, 9600, 11250, 9550, 14260, 14115, 10084,…
## $ LandSlope                <chr> "Gentleslope", "Gentleslope", "Gentleslope", …
## $ BuildingType             <chr> "Single-family Detached", "Single-family Deta…
## $ OverallCondition         <dbl> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 5, 5, …
## $ YearBuilt                <dbl> 2003, 1976, 2001, 1915, 2000, 1993, 2004, 197…
## $ ExteriorCondition        <chr> "Average/Typical", "Average/Typical", "Averag…
## $ Foundation               <chr> "Poured Contrete", "Cinder Block", "Poured Co…
## $ TotalBasementSquareFeet  <dbl> 856, 1262, 920, 756, 1145, 796, 1686, 1107, 9…
## $ HeatingQualityCondition  <chr> "Excellent", "Excellent", "Excellent", "Good"…
## $ CentralAirConditioning   <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Ye…
## $ `1stFloorSquareFeet`     <dbl> 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1…
## $ `2ndFlrSquareFeet`       <dbl> 854, 0, 866, 756, 1053, 566, 0, 983, 752, 0, …
## $ LivAreaSquareFeet        <dbl> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 209…
## $ FullBathrooms            <dbl> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, …
## $ Bedrooms                 <dbl> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2, 3, 4, 2, 3, 2, …
## $ KitchenQualityCondition  <chr> "Good", "Typical/Average", "Good", "Good", "G…
## $ TotalRooms               <dbl> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5, 5, 11, 4, 7, 5,…
## $ GarageArea               <dbl> 548, 460, 608, 642, 836, 480, 636, 484, 468, …
## $ TotalPorchAreaSquareFeet <dbl> 61, 0, 42, 307, 84, 30, 57, 432, 205, 4, 0, 2…
## $ MonthSold                <dbl> 2, 5, 9, 2, 12, 10, 8, 11, 4, 1, 2, 7, 9, 8, …
## $ YearSold                 <dbl> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 200…
## $ SaleType                 <chr> "Warranty Deed - Conventional", "Warranty Dee…
## $ SaleCondition            <chr> "Normal", "Normal", "Normal", "Abnorml", "Nor…
## $ SalePrice                <dbl> 208500, 181500, 223500, 140000, 250000, 14300…

#Checking for ouliers #boxplots

library(ggplot2)
ggplot(data=house_pricing, aes(y=HouseId))+
  geom_boxplot(fill="blue")

ggplot(data=house_pricing, aes(y=LotAreaSquareFeet))+
  geom_boxplot(fill="brown")

ggplot(data=house_pricing, aes(y=OverallCondition))+
  geom_boxplot(fill="purple")

ggplot(data=house_pricing, aes(y=TotalBasementSquareFeet))+
  geom_boxplot(fill="red")

ggplot(data=house_pricing, aes(y=LivAreaSquareFeet))+
  geom_boxplot(fill="black")

ggplot(data=house_pricing, aes(y=FullBathrooms))+
  geom_boxplot(fill="brown")

ggplot(data=house_pricing, aes(y=Bedrooms))+
  geom_boxplot(fill="lightblue")

ggplot(data=house_pricing, aes(y=TotalRooms))+
  geom_boxplot(fill="pink")

ggplot(data=house_pricing, aes(y=GarageArea))+
  geom_boxplot(fill="gold")

ggplot(data=house_pricing, aes(y=TotalPorchAreaSquareFeet))+
  geom_boxplot(fill="blue")

ggplot(data=house_pricing, aes(y=SalePrice))+
  geom_boxplot(fill="blue")

#Interpretation of the boxplots ##among the numerical variables HouseId and fullbathrooms have no outliers. ##The ones with long boxplots like Yearsold and HouseId have a high IQR and a large spread while those with narrow boxplots i.e. LotAreaSquareFeet and overall condition have a low IQR and have a small spread. #Scatterpplots

ggplot(data=house_pricing, aes(x=Bedrooms, y=SalePrice))+
  geom_point(color="brown")+
  facet_wrap(~SaleType)

#The scatter plots show that the data has ouliers #Histogram

ggplot(data=house_pricing, aes(x=HouseId))+
  geom_histogram(fill="purple", binwidth = 5)+
  labs(y="TotalPorchAreaSquareFeet", x="HouseId", title="Histogram")+
  facet_wrap("SaleType")

##most bins are not of equal heights thus the data is not uniformly distributed. #An estimate of probability density function

ggplot(data=house_pricing, aes(x=OverallCondition))+
  geom_density(color="black")+
  facet_wrap("MSZoning")

##the data is mostly right skewed #Baplots

house_pricing %>%
  group_by(LandSlope, MSZoning) %>%
  summarise(mean_SalePrice = mean(SalePrice))
## `summarise()` has grouped output by 'LandSlope'. You can override using the
## `.groups` argument.
## # A tibble: 10 × 3
## # Groups:   LandSlope [3]
##    LandSlope     MSZoning                     mean_SalePrice
##    <chr>         <chr>                                 <dbl>
##  1 Gentleslope   Commercial                           68359.
##  2 Gentleslope   Floating Village Residential        214014.
##  3 Gentleslope   Residential High Density            135062.
##  4 Gentleslope   Residential Low Density             190059.
##  5 Gentleslope   Residential Medium Density          124458.
##  6 ModerateSlope Commercial                           88923 
##  7 ModerateSlope Residential High Density             79000 
##  8 ModerateSlope Residential Low Density             206769.
##  9 ModerateSlope Residential Medium Density          182343.
## 10 SevereSlope   Residential Low Density             204379.

#Correlation and pairwise plots

library(corrr)
## Warning: package 'corrr' was built under R version 4.4.3
library(GGally)
## Warning: package 'GGally' was built under R version 4.4.3
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
correlation_matrix <- house_pricing %>% correlate()
## Non-numeric variables removed from input: `MSZoning`, `LandSlope`, `BuildingType`, `ExteriorCondition`, `Foundation`, `HeatingQualityCondition`, `CentralAirConditioning`, `KitchenQualityCondition`, `SaleType`, and `SaleCondition`
## Correlation computed with
## • Method: 'pearson'
## • Missing treated using: 'pairwise.complete.obs'
print(correlation_matrix)
## # A tibble: 16 × 17
##    term                     HouseId LotAreaSquareFeet OverallCondition YearBuilt
##    <chr>                      <dbl>             <dbl>            <dbl>     <dbl>
##  1 HouseId                 NA                -0.0332           0.0126    -0.0127
##  2 LotAreaSquareFeet       -3.32e-2          NA               -0.00564    0.0142
##  3 OverallCondition         1.26e-2          -0.00564         NA         -0.376 
##  4 YearBuilt               -1.27e-2           0.0142          -0.376     NA     
##  5 TotalBasementSquareFeet -1.54e-2           0.261           -0.171      0.391 
##  6 1stFloorSquareFeet       1.05e-2           0.299           -0.144      0.282 
##  7 2ndFlrSquareFeet         5.59e-3           0.0510           0.0289     0.0103
##  8 LivAreaSquareFeet        8.27e-3           0.263           -0.0797     0.199 
##  9 FullBathrooms            5.59e-3           0.126           -0.194      0.468 
## 10 Bedrooms                 3.77e-2           0.120            0.0130    -0.0707
## 11 TotalRooms               2.72e-2           0.190           -0.0576     0.0956
## 12 GarageArea               1.76e-2           0.180           -0.152      0.479 
## 13 TotalPorchAreaSquareFe…  1.69e-3           0.0524           0.0249    -0.130 
## 14 MonthSold                2.12e-2           0.00120         -0.00351    0.0124
## 15 YearSold                 7.12e-4          -0.0143           0.0439    -0.0136
## 16 SalePrice               -2.19e-2           0.264           -0.0779     0.523 
## # ℹ 12 more variables: TotalBasementSquareFeet <dbl>,
## #   `1stFloorSquareFeet` <dbl>, `2ndFlrSquareFeet` <dbl>,
## #   LivAreaSquareFeet <dbl>, FullBathrooms <dbl>, Bedrooms <dbl>,
## #   TotalRooms <dbl>, GarageArea <dbl>, TotalPorchAreaSquareFeet <dbl>,
## #   MonthSold <dbl>, YearSold <dbl>, SalePrice <dbl>
rplot(correlation_matrix)

#Interpretation of correlation ##0-0.3, the variables are a low correlation ##0.4-0.7, are moderately correlated ##0.8-1 are highly correlated ## the negative correlations show indirect relationship while positive correlations show direct relationship. #Pairwise

pairwise_plot <- ggpairs(house_pricing)

#Statistical methods

qqnorm(house_pricing$HouseId)
qqline(house_pricing$HouseId)

qqnorm(house_pricing$LotAreaSquareFeet)
qqline(house_pricing$LotAreaSquareFeet)

##the points fall close to the qqline thus data is approximately normally distributed ###the points curve upwards therefore the data is rightly/ positively skewed. #test for normality

shapiro.test(house_pricing$Bedrooms)
## 
##  Shapiro-Wilk normality test
## 
## data:  house_pricing$Bedrooms
## W = 0.8498, p-value < 2.2e-16
shapiro.test(house_pricing$OverallCondition)
## 
##  Shapiro-Wilk normality test
## 
## data:  house_pricing$OverallCondition
## W = 0.82892, p-value < 2.2e-16
shapiro.test(house_pricing$TotalBasementSquareFeet)
## 
##  Shapiro-Wilk normality test
## 
## data:  house_pricing$TotalBasementSquareFeet
## W = 0.91735, p-value < 2.2e-16

##the pvalue is < 0.05(significance level)thus we do not reject H0, the data is normally distributed. #T Test ##the mean was 730.5

summary(house_pricing)
##     HouseId         MSZoning         LotAreaSquareFeet  LandSlope        
##  Min.   :   1.0   Length:1460        Min.   :  1300    Length:1460       
##  1st Qu.: 365.8   Class :character   1st Qu.:  7554    Class :character  
##  Median : 730.5   Mode  :character   Median :  9478    Mode  :character  
##  Mean   : 730.5                      Mean   : 10517                      
##  3rd Qu.:1095.2                      3rd Qu.: 11602                      
##  Max.   :1460.0                      Max.   :215245                      
##  BuildingType       OverallCondition   YearBuilt    ExteriorCondition 
##  Length:1460        Min.   :1.000    Min.   :1872   Length:1460       
##  Class :character   1st Qu.:5.000    1st Qu.:1954   Class :character  
##  Mode  :character   Median :5.000    Median :1973   Mode  :character  
##                     Mean   :5.575    Mean   :1971                     
##                     3rd Qu.:6.000    3rd Qu.:2000                     
##                     Max.   :9.000    Max.   :2010                     
##   Foundation        TotalBasementSquareFeet HeatingQualityCondition
##  Length:1460        Min.   :   0.0          Length:1460            
##  Class :character   1st Qu.: 795.8          Class :character       
##  Mode  :character   Median : 991.5          Mode  :character       
##                     Mean   :1057.4                                 
##                     3rd Qu.:1298.2                                 
##                     Max.   :6110.0                                 
##  CentralAirConditioning 1stFloorSquareFeet 2ndFlrSquareFeet LivAreaSquareFeet
##  Length:1460            Min.   : 334       Min.   :   0     Min.   : 334     
##  Class :character       1st Qu.: 882       1st Qu.:   0     1st Qu.:1130     
##  Mode  :character       Median :1087       Median :   0     Median :1464     
##                         Mean   :1163       Mean   : 347     Mean   :1515     
##                         3rd Qu.:1391       3rd Qu.: 728     3rd Qu.:1777     
##                         Max.   :4692       Max.   :2065     Max.   :5642     
##  FullBathrooms      Bedrooms     KitchenQualityCondition   TotalRooms    
##  Min.   :0.000   Min.   :0.000   Length:1460             Min.   : 2.000  
##  1st Qu.:1.000   1st Qu.:2.000   Class :character        1st Qu.: 5.000  
##  Median :2.000   Median :3.000   Mode  :character        Median : 6.000  
##  Mean   :1.565   Mean   :2.866                           Mean   : 6.518  
##  3rd Qu.:2.000   3rd Qu.:3.000                           3rd Qu.: 7.000  
##  Max.   :3.000   Max.   :8.000                           Max.   :14.000  
##    GarageArea     TotalPorchAreaSquareFeet   MonthSold         YearSold   
##  Min.   :   0.0   Min.   :  0.00           Min.   : 1.000   Min.   :2006  
##  1st Qu.: 334.5   1st Qu.:  0.00           1st Qu.: 5.000   1st Qu.:2007  
##  Median : 480.0   Median : 40.00           Median : 6.000   Median :2008  
##  Mean   : 473.0   Mean   : 68.61           Mean   : 6.322   Mean   :2008  
##  3rd Qu.: 576.0   3rd Qu.:104.00           3rd Qu.: 8.000   3rd Qu.:2009  
##  Max.   :1418.0   Max.   :638.00           Max.   :12.000   Max.   :2010  
##    SaleType         SaleCondition        SalePrice     
##  Length:1460        Length:1460        Min.   : 34900  
##  Class :character   Class :character   1st Qu.:129975  
##  Mode  :character   Mode  :character   Median :163000  
##                                        Mean   :180921  
##                                        3rd Qu.:214000  
##                                        Max.   :755000

#Making the categorical data recognizable

HousePrice<-house_pricing%>%
  mutate(MSZoning2=as.factor(MSZoning))
group1<-rnorm(1000, mean = 5.575, sd= 1)
group2<-rnorm(1000, mean = 10, sd= 1)
t.test(group1, group2, alternative="greater")
## 
##  Welch Two Sample t-test
## 
## data:  group1 and group2
## t = -100.43, df = 1993.4, p-value = 1
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  -4.555524       Inf
## sample estimates:
## mean of x mean of y 
##  5.519135 10.001215
t.test(group1, group2)
## 
##  Welch Two Sample t-test
## 
## data:  group1 and group2
## t = -100.43, df = 1993.4, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -4.569606 -4.394555
## sample estimates:
## mean of x mean of y 
##  5.519135 10.001215

#Test for goodness fit

table=xtabs(HouseId~SalePrice+SaleCondition, house_pricing)
chisq.test(table)
## Warning in chisq.test(table): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table
## X-squared = 2909299, df = 3310, p-value < 2.2e-16

##for chisq the p value is < 0.05 thus we reject H0 and conclude that the 2 variables are not independent. #ANOVA test ##One way ANOVA

aov(TotalBasementSquareFeet~ BuildingType, data=house_pricing)
## Call:
##    aov(formula = TotalBasementSquareFeet ~ BuildingType, data = house_pricing)
## 
## Terms:
##                 BuildingType Residuals
## Sum of Squares       6294154 274508431
## Deg. of Freedom            4      1455
## 
## Residual standard error: 434.3565
## Estimated effects may be unbalanced

##Two way ANOVA

aov(SalePrice~ExteriorCondition*MSZoning, data = house_pricing)
## Call:
##    aov(formula = SalePrice ~ ExteriorCondition * MSZoning, data = house_pricing)
## 
## Terms:
##                 ExteriorCondition     MSZoning ExteriorCondition:MSZoning
## Sum of Squares       2.174690e+11 8.966145e+11               7.862979e+10
## Deg. of Freedom                 4            4                          7
##                    Residuals
## Sum of Squares  8.015198e+12
## Deg. of Freedom         1444
## 
## Residual standard error: 74502.96
## 9 out of 25 effects not estimable
## Estimated effects may be unbalanced

##ANOVA shows if the means are different. ###THE END!!!