library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(readxl)
library(ggplot2)
library(corrplot)
## corrplot 0.95 loaded
house_pricing<-read_xlsx("C:/Users/CiCi/Downloads/Datasets Statistical programming/House pricing data.xlsx")
#checking for missing values ##dataframe
colSums(is.na(house_pricing))
## HouseId MSZoning LotAreaSquareFeet
## 0 0 0
## LandSlope BuildingType OverallCondition
## 0 0 0
## YearBuilt ExteriorCondition Foundation
## 0 0 0
## TotalBasementSquareFeet HeatingQualityCondition CentralAirConditioning
## 0 0 0
## 1stFloorSquareFeet 2ndFlrSquareFeet LivAreaSquareFeet
## 0 0 0
## FullBathrooms Bedrooms KitchenQualityCondition
## 0 0 0
## TotalRooms GarageArea TotalPorchAreaSquareFeet
## 0 0 0
## MonthSold YearSold SaleType
## 0 0 0
## SaleCondition SalePrice
## 0 0
##there are no mising values #view of the data
glimpse(house_pricing)
## Rows: 1,460
## Columns: 26
## $ HouseId <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14…
## $ MSZoning <chr> "Residential Low Density", "Residential Low D…
## $ LotAreaSquareFeet <dbl> 8450, 9600, 11250, 9550, 14260, 14115, 10084,…
## $ LandSlope <chr> "Gentleslope", "Gentleslope", "Gentleslope", …
## $ BuildingType <chr> "Single-family Detached", "Single-family Deta…
## $ OverallCondition <dbl> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 5, 5, …
## $ YearBuilt <dbl> 2003, 1976, 2001, 1915, 2000, 1993, 2004, 197…
## $ ExteriorCondition <chr> "Average/Typical", "Average/Typical", "Averag…
## $ Foundation <chr> "Poured Contrete", "Cinder Block", "Poured Co…
## $ TotalBasementSquareFeet <dbl> 856, 1262, 920, 756, 1145, 796, 1686, 1107, 9…
## $ HeatingQualityCondition <chr> "Excellent", "Excellent", "Excellent", "Good"…
## $ CentralAirConditioning <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Ye…
## $ `1stFloorSquareFeet` <dbl> 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1…
## $ `2ndFlrSquareFeet` <dbl> 854, 0, 866, 756, 1053, 566, 0, 983, 752, 0, …
## $ LivAreaSquareFeet <dbl> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 209…
## $ FullBathrooms <dbl> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, …
## $ Bedrooms <dbl> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2, 3, 4, 2, 3, 2, …
## $ KitchenQualityCondition <chr> "Good", "Typical/Average", "Good", "Good", "G…
## $ TotalRooms <dbl> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5, 5, 11, 4, 7, 5,…
## $ GarageArea <dbl> 548, 460, 608, 642, 836, 480, 636, 484, 468, …
## $ TotalPorchAreaSquareFeet <dbl> 61, 0, 42, 307, 84, 30, 57, 432, 205, 4, 0, 2…
## $ MonthSold <dbl> 2, 5, 9, 2, 12, 10, 8, 11, 4, 1, 2, 7, 9, 8, …
## $ YearSold <dbl> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 200…
## $ SaleType <chr> "Warranty Deed - Conventional", "Warranty Dee…
## $ SaleCondition <chr> "Normal", "Normal", "Normal", "Abnorml", "Nor…
## $ SalePrice <dbl> 208500, 181500, 223500, 140000, 250000, 14300…
#Checking for ouliers #boxplots
library(ggplot2)
ggplot(data=house_pricing, aes(y=HouseId))+
geom_boxplot(fill="blue")
ggplot(data=house_pricing, aes(y=LotAreaSquareFeet))+
geom_boxplot(fill="brown")
ggplot(data=house_pricing, aes(y=OverallCondition))+
geom_boxplot(fill="purple")
ggplot(data=house_pricing, aes(y=TotalBasementSquareFeet))+
geom_boxplot(fill="red")
ggplot(data=house_pricing, aes(y=LivAreaSquareFeet))+
geom_boxplot(fill="black")
ggplot(data=house_pricing, aes(y=FullBathrooms))+
geom_boxplot(fill="brown")
ggplot(data=house_pricing, aes(y=Bedrooms))+
geom_boxplot(fill="lightblue")
ggplot(data=house_pricing, aes(y=TotalRooms))+
geom_boxplot(fill="pink")
ggplot(data=house_pricing, aes(y=GarageArea))+
geom_boxplot(fill="gold")
ggplot(data=house_pricing, aes(y=TotalPorchAreaSquareFeet))+
geom_boxplot(fill="blue")
ggplot(data=house_pricing, aes(y=SalePrice))+
geom_boxplot(fill="blue")
#Interpretation of the boxplots ##among the numerical variables HouseId
and fullbathrooms have no outliers. ##The ones with long boxplots like
Yearsold and HouseId have a high IQR and a large spread while those with
narrow boxplots i.e. LotAreaSquareFeet and overall condition have a low
IQR and have a small spread. #Scatterpplots
ggplot(data=house_pricing, aes(x=Bedrooms, y=SalePrice))+
geom_point(color="brown")+
facet_wrap(~SaleType)
#The scatter plots show that the data has ouliers #Histogram
ggplot(data=house_pricing, aes(x=HouseId))+
geom_histogram(fill="purple", binwidth = 5)+
labs(y="TotalPorchAreaSquareFeet", x="HouseId", title="Histogram")+
facet_wrap("SaleType")
##most bins are not of equal heights thus the data is not uniformly
distributed. #An estimate of probability density function
ggplot(data=house_pricing, aes(x=OverallCondition))+
geom_density(color="black")+
facet_wrap("MSZoning")
##the data is mostly right skewed #Baplots
house_pricing %>%
group_by(LandSlope, MSZoning) %>%
summarise(mean_SalePrice = mean(SalePrice))
## `summarise()` has grouped output by 'LandSlope'. You can override using the
## `.groups` argument.
## # A tibble: 10 × 3
## # Groups: LandSlope [3]
## LandSlope MSZoning mean_SalePrice
## <chr> <chr> <dbl>
## 1 Gentleslope Commercial 68359.
## 2 Gentleslope Floating Village Residential 214014.
## 3 Gentleslope Residential High Density 135062.
## 4 Gentleslope Residential Low Density 190059.
## 5 Gentleslope Residential Medium Density 124458.
## 6 ModerateSlope Commercial 88923
## 7 ModerateSlope Residential High Density 79000
## 8 ModerateSlope Residential Low Density 206769.
## 9 ModerateSlope Residential Medium Density 182343.
## 10 SevereSlope Residential Low Density 204379.
#Correlation and pairwise plots
library(corrr)
## Warning: package 'corrr' was built under R version 4.4.3
library(GGally)
## Warning: package 'GGally' was built under R version 4.4.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
correlation_matrix <- house_pricing %>% correlate()
## Non-numeric variables removed from input: `MSZoning`, `LandSlope`, `BuildingType`, `ExteriorCondition`, `Foundation`, `HeatingQualityCondition`, `CentralAirConditioning`, `KitchenQualityCondition`, `SaleType`, and `SaleCondition`
## Correlation computed with
## • Method: 'pearson'
## • Missing treated using: 'pairwise.complete.obs'
print(correlation_matrix)
## # A tibble: 16 × 17
## term HouseId LotAreaSquareFeet OverallCondition YearBuilt
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 HouseId NA -0.0332 0.0126 -0.0127
## 2 LotAreaSquareFeet -3.32e-2 NA -0.00564 0.0142
## 3 OverallCondition 1.26e-2 -0.00564 NA -0.376
## 4 YearBuilt -1.27e-2 0.0142 -0.376 NA
## 5 TotalBasementSquareFeet -1.54e-2 0.261 -0.171 0.391
## 6 1stFloorSquareFeet 1.05e-2 0.299 -0.144 0.282
## 7 2ndFlrSquareFeet 5.59e-3 0.0510 0.0289 0.0103
## 8 LivAreaSquareFeet 8.27e-3 0.263 -0.0797 0.199
## 9 FullBathrooms 5.59e-3 0.126 -0.194 0.468
## 10 Bedrooms 3.77e-2 0.120 0.0130 -0.0707
## 11 TotalRooms 2.72e-2 0.190 -0.0576 0.0956
## 12 GarageArea 1.76e-2 0.180 -0.152 0.479
## 13 TotalPorchAreaSquareFe… 1.69e-3 0.0524 0.0249 -0.130
## 14 MonthSold 2.12e-2 0.00120 -0.00351 0.0124
## 15 YearSold 7.12e-4 -0.0143 0.0439 -0.0136
## 16 SalePrice -2.19e-2 0.264 -0.0779 0.523
## # ℹ 12 more variables: TotalBasementSquareFeet <dbl>,
## # `1stFloorSquareFeet` <dbl>, `2ndFlrSquareFeet` <dbl>,
## # LivAreaSquareFeet <dbl>, FullBathrooms <dbl>, Bedrooms <dbl>,
## # TotalRooms <dbl>, GarageArea <dbl>, TotalPorchAreaSquareFeet <dbl>,
## # MonthSold <dbl>, YearSold <dbl>, SalePrice <dbl>
rplot(correlation_matrix)
#Interpretation of correlation ##0-0.3, the variables are a low
correlation ##0.4-0.7, are moderately correlated ##0.8-1 are highly
correlated ## the negative correlations show indirect relationship while
positive correlations show direct relationship. #Pairwise
pairwise_plot <- ggpairs(house_pricing)
#Statistical methods
qqnorm(house_pricing$HouseId)
qqline(house_pricing$HouseId)
qqnorm(house_pricing$LotAreaSquareFeet)
qqline(house_pricing$LotAreaSquareFeet)
##the points fall close to the qqline thus data is approximately
normally distributed ###the points curve upwards therefore the data is
rightly/ positively skewed. #test for normality
shapiro.test(house_pricing$Bedrooms)
##
## Shapiro-Wilk normality test
##
## data: house_pricing$Bedrooms
## W = 0.8498, p-value < 2.2e-16
shapiro.test(house_pricing$OverallCondition)
##
## Shapiro-Wilk normality test
##
## data: house_pricing$OverallCondition
## W = 0.82892, p-value < 2.2e-16
shapiro.test(house_pricing$TotalBasementSquareFeet)
##
## Shapiro-Wilk normality test
##
## data: house_pricing$TotalBasementSquareFeet
## W = 0.91735, p-value < 2.2e-16
##the pvalue is < 0.05(significance level)thus we do not reject H0, the data is normally distributed. #T Test ##the mean was 730.5
summary(house_pricing)
## HouseId MSZoning LotAreaSquareFeet LandSlope
## Min. : 1.0 Length:1460 Min. : 1300 Length:1460
## 1st Qu.: 365.8 Class :character 1st Qu.: 7554 Class :character
## Median : 730.5 Mode :character Median : 9478 Mode :character
## Mean : 730.5 Mean : 10517
## 3rd Qu.:1095.2 3rd Qu.: 11602
## Max. :1460.0 Max. :215245
## BuildingType OverallCondition YearBuilt ExteriorCondition
## Length:1460 Min. :1.000 Min. :1872 Length:1460
## Class :character 1st Qu.:5.000 1st Qu.:1954 Class :character
## Mode :character Median :5.000 Median :1973 Mode :character
## Mean :5.575 Mean :1971
## 3rd Qu.:6.000 3rd Qu.:2000
## Max. :9.000 Max. :2010
## Foundation TotalBasementSquareFeet HeatingQualityCondition
## Length:1460 Min. : 0.0 Length:1460
## Class :character 1st Qu.: 795.8 Class :character
## Mode :character Median : 991.5 Mode :character
## Mean :1057.4
## 3rd Qu.:1298.2
## Max. :6110.0
## CentralAirConditioning 1stFloorSquareFeet 2ndFlrSquareFeet LivAreaSquareFeet
## Length:1460 Min. : 334 Min. : 0 Min. : 334
## Class :character 1st Qu.: 882 1st Qu.: 0 1st Qu.:1130
## Mode :character Median :1087 Median : 0 Median :1464
## Mean :1163 Mean : 347 Mean :1515
## 3rd Qu.:1391 3rd Qu.: 728 3rd Qu.:1777
## Max. :4692 Max. :2065 Max. :5642
## FullBathrooms Bedrooms KitchenQualityCondition TotalRooms
## Min. :0.000 Min. :0.000 Length:1460 Min. : 2.000
## 1st Qu.:1.000 1st Qu.:2.000 Class :character 1st Qu.: 5.000
## Median :2.000 Median :3.000 Mode :character Median : 6.000
## Mean :1.565 Mean :2.866 Mean : 6.518
## 3rd Qu.:2.000 3rd Qu.:3.000 3rd Qu.: 7.000
## Max. :3.000 Max. :8.000 Max. :14.000
## GarageArea TotalPorchAreaSquareFeet MonthSold YearSold
## Min. : 0.0 Min. : 0.00 Min. : 1.000 Min. :2006
## 1st Qu.: 334.5 1st Qu.: 0.00 1st Qu.: 5.000 1st Qu.:2007
## Median : 480.0 Median : 40.00 Median : 6.000 Median :2008
## Mean : 473.0 Mean : 68.61 Mean : 6.322 Mean :2008
## 3rd Qu.: 576.0 3rd Qu.:104.00 3rd Qu.: 8.000 3rd Qu.:2009
## Max. :1418.0 Max. :638.00 Max. :12.000 Max. :2010
## SaleType SaleCondition SalePrice
## Length:1460 Length:1460 Min. : 34900
## Class :character Class :character 1st Qu.:129975
## Mode :character Mode :character Median :163000
## Mean :180921
## 3rd Qu.:214000
## Max. :755000
#Making the categorical data recognizable
HousePrice<-house_pricing%>%
mutate(MSZoning2=as.factor(MSZoning))
group1<-rnorm(1000, mean = 5.575, sd= 1)
group2<-rnorm(1000, mean = 10, sd= 1)
t.test(group1, group2, alternative="greater")
##
## Welch Two Sample t-test
##
## data: group1 and group2
## t = -100.43, df = 1993.4, p-value = 1
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## -4.555524 Inf
## sample estimates:
## mean of x mean of y
## 5.519135 10.001215
t.test(group1, group2)
##
## Welch Two Sample t-test
##
## data: group1 and group2
## t = -100.43, df = 1993.4, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -4.569606 -4.394555
## sample estimates:
## mean of x mean of y
## 5.519135 10.001215
#Test for goodness fit
table=xtabs(HouseId~SalePrice+SaleCondition, house_pricing)
chisq.test(table)
## Warning in chisq.test(table): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: table
## X-squared = 2909299, df = 3310, p-value < 2.2e-16
##for chisq the p value is < 0.05 thus we reject H0 and conclude that the 2 variables are not independent. #ANOVA test ##One way ANOVA
aov(TotalBasementSquareFeet~ BuildingType, data=house_pricing)
## Call:
## aov(formula = TotalBasementSquareFeet ~ BuildingType, data = house_pricing)
##
## Terms:
## BuildingType Residuals
## Sum of Squares 6294154 274508431
## Deg. of Freedom 4 1455
##
## Residual standard error: 434.3565
## Estimated effects may be unbalanced
##Two way ANOVA
aov(SalePrice~ExteriorCondition*MSZoning, data = house_pricing)
## Call:
## aov(formula = SalePrice ~ ExteriorCondition * MSZoning, data = house_pricing)
##
## Terms:
## ExteriorCondition MSZoning ExteriorCondition:MSZoning
## Sum of Squares 2.174690e+11 8.966145e+11 7.862979e+10
## Deg. of Freedom 4 4 7
## Residuals
## Sum of Squares 8.015198e+12
## Deg. of Freedom 1444
##
## Residual standard error: 74502.96
## 9 out of 25 effects not estimable
## Estimated effects may be unbalanced
##ANOVA shows if the means are different. ###THE END!!!