```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
```
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
# Load the Ames housing data; the first row of the CSV holds the column names.
# NOTE(review): this is an absolute, machine-specific path, so the document
# only knits on a machine where that file exists.
ames <- read.csv(
  file = 'D:/Stats for DS/ames.csv',
  header = TRUE
)
Display the first few rows of the data
# Preview the top of the data frame; six rows is head()'s default, spelled out.
head(ames, n = 6L)
## Order PID MS.SubClass MS.Zoning Lot.Frontage Lot.Area Street Alley
## 1 1 526301100 20 RL 141 31770 Pave <NA>
## 2 2 526350040 20 RH 80 11622 Pave <NA>
## 3 3 526351010 20 RL 81 14267 Pave <NA>
## 4 4 526353030 20 RL 93 11160 Pave <NA>
## 5 5 527105010 60 RL 74 13830 Pave <NA>
## 6 6 527105030 60 RL 78 9978 Pave <NA>
## Lot.Shape Land.Contour Utilities Lot.Config Land.Slope Neighborhood
## 1 IR1 Lvl AllPub Corner Gtl NAmes
## 2 Reg Lvl AllPub Inside Gtl NAmes
## 3 IR1 Lvl AllPub Corner Gtl NAmes
## 4 Reg Lvl AllPub Corner Gtl NAmes
## 5 IR1 Lvl AllPub Inside Gtl Gilbert
## 6 IR1 Lvl AllPub Inside Gtl Gilbert
## Condition.1 Condition.2 Bldg.Type House.Style Overall.Qual Overall.Cond
## 1 Norm Norm 1Fam 1Story 6 5
## 2 Feedr Norm 1Fam 1Story 5 6
## 3 Norm Norm 1Fam 1Story 6 6
## 4 Norm Norm 1Fam 1Story 7 5
## 5 Norm Norm 1Fam 2Story 5 5
## 6 Norm Norm 1Fam 2Story 6 6
## Year.Built Year.Remod.Add Roof.Style Roof.Matl Exterior.1st Exterior.2nd
## 1 1960 1960 Hip CompShg BrkFace Plywood
## 2 1961 1961 Gable CompShg VinylSd VinylSd
## 3 1958 1958 Hip CompShg Wd Sdng Wd Sdng
## 4 1968 1968 Hip CompShg BrkFace BrkFace
## 5 1997 1998 Gable CompShg VinylSd VinylSd
## 6 1998 1998 Gable CompShg VinylSd VinylSd
## Mas.Vnr.Type Mas.Vnr.Area Exter.Qual Exter.Cond Foundation Bsmt.Qual
## 1 Stone 112 TA TA CBlock TA
## 2 None 0 TA TA CBlock TA
## 3 BrkFace 108 TA TA CBlock TA
## 4 None 0 Gd TA CBlock TA
## 5 None 0 TA TA PConc Gd
## 6 BrkFace 20 TA TA PConc TA
## Bsmt.Cond Bsmt.Exposure BsmtFin.Type.1 BsmtFin.SF.1 BsmtFin.Type.2
## 1 Gd Gd BLQ 639 Unf
## 2 TA No Rec 468 LwQ
## 3 TA No ALQ 923 Unf
## 4 TA No ALQ 1065 Unf
## 5 TA No GLQ 791 Unf
## 6 TA No GLQ 602 Unf
## BsmtFin.SF.2 Bsmt.Unf.SF Total.Bsmt.SF Heating Heating.QC Central.Air
## 1 0 441 1080 GasA Fa Y
## 2 144 270 882 GasA TA Y
## 3 0 406 1329 GasA TA Y
## 4 0 1045 2110 GasA Ex Y
## 5 0 137 928 GasA Gd Y
## 6 0 324 926 GasA Ex Y
## Electrical X1st.Flr.SF X2nd.Flr.SF Low.Qual.Fin.SF Gr.Liv.Area Bsmt.Full.Bath
## 1 SBrkr 1656 0 0 1656 1
## 2 SBrkr 896 0 0 896 0
## 3 SBrkr 1329 0 0 1329 0
## 4 SBrkr 2110 0 0 2110 1
## 5 SBrkr 928 701 0 1629 0
## 6 SBrkr 926 678 0 1604 0
## Bsmt.Half.Bath Full.Bath Half.Bath Bedroom.AbvGr Kitchen.AbvGr Kitchen.Qual
## 1 0 1 0 3 1 TA
## 2 0 1 0 2 1 TA
## 3 0 1 1 3 1 Gd
## 4 0 2 1 3 1 Ex
## 5 0 2 1 3 1 TA
## 6 0 2 1 3 1 Gd
## TotRms.AbvGrd Functional Fireplaces Fireplace.Qu Garage.Type Garage.Yr.Blt
## 1 7 Typ 2 Gd Attchd 1960
## 2 5 Typ 0 <NA> Attchd 1961
## 3 6 Typ 0 <NA> Attchd 1958
## 4 8 Typ 2 TA Attchd 1968
## 5 6 Typ 1 TA Attchd 1997
## 6 7 Typ 1 Gd Attchd 1998
## Garage.Finish Garage.Cars Garage.Area Garage.Qual Garage.Cond Paved.Drive
## 1 Fin 2 528 TA TA P
## 2 Unf 1 730 TA TA Y
## 3 Unf 1 312 TA TA Y
## 4 Fin 2 522 TA TA Y
## 5 Fin 2 482 TA TA Y
## 6 Fin 2 470 TA TA Y
## Wood.Deck.SF Open.Porch.SF Enclosed.Porch X3Ssn.Porch Screen.Porch Pool.Area
## 1 210 62 0 0 0 0
## 2 140 0 0 0 120 0
## 3 393 36 0 0 0 0
## 4 0 0 0 0 0 0
## 5 212 34 0 0 0 0
## 6 360 36 0 0 0 0
## Pool.QC Fence Misc.Feature Misc.Val Mo.Sold Yr.Sold Sale.Type Sale.Condition
## 1 <NA> <NA> <NA> 0 5 2010 WD Normal
## 2 <NA> MnPrv <NA> 0 6 2010 WD Normal
## 3 <NA> <NA> Gar2 12500 6 2010 WD Normal
## 4 <NA> <NA> <NA> 0 4 2010 WD Normal
## 5 <NA> MnPrv <NA> 0 3 2010 WD Normal
## 6 <NA> <NA> <NA> 0 6 2010 WD Normal
## SalePrice
## 1 215000
## 2 105000
## 3 172000
## 4 244000
## 5 189900
## 6 195500
# Mean sale price and number of sales per zoning class, ordered from the
# rarest class to the most common one. Classes with more than two sales are
# tagged 'Normal Group'; the rest are tagged 'Low Probability Group'.
zoning_group <- ames %>%
  group_by(MS.Zoning) %>%
  summarise(
    Avg_SalePrice = mean(SalePrice, na.rm = TRUE),
    Count = n()
  ) %>%
  arrange(Count) %>%
  mutate(
    Probability_Tag = ifelse(
      Count > 2,
      'Normal Group',
      'Low Probability Group'
    )
  )
zoning_group
## # A tibble: 7 × 4
## MS.Zoning Avg_SalePrice Count Probability_Tag
## <chr> <dbl> <int> <chr>
## 1 A (agr) 47300 2 Low Probability Group
## 2 I (all) 80312. 2 Low Probability Group
## 3 C (all) 79795. 25 Normal Group
## 4 RH 136420. 27 Normal Group
## 5 FV 218987. 139 Normal Group
## 6 RM 126781. 462 Normal Group
## 7 RL 191283. 2273 Normal Group
# Mean sale price and number of sales per neighborhood, ordered from the
# rarest neighborhood to the most common one. Neighborhoods with more than two
# sales are tagged 'Normal Group'; the rest are tagged 'Low Probability Group'.
neighborhood_group <- ames %>%
  group_by(Neighborhood) %>%
  summarise(
    Avg_SalePrice = mean(SalePrice, na.rm = TRUE),
    Count = n()
  ) %>%
  arrange(Count) %>%
  mutate(
    Probability_Tag = ifelse(
      Count > 2,
      'Normal Group',
      'Low Probability Group'
    )
  )
neighborhood_group
## # A tibble: 28 × 4
## Neighborhood Avg_SalePrice Count Probability_Tag
## <chr> <dbl> <int> <chr>
## 1 Landmrk 137000 1 Low Probability Group
## 2 GrnHill 280000 2 Low Probability Group
## 3 Greens 193531. 8 Normal Group
## 4 Blueste 143590 10 Normal Group
## 5 NPkVill 140711. 23 Normal Group
## 6 Veenker 248315. 24 Normal Group
## 7 Blmngtn 196662. 28 Normal Group
## 8 BrDale 105608. 30 Normal Group
## 9 MeadowV 95756. 37 Normal Group
## 10 ClearCr 208662. 44 Normal Group
## # ℹ 18 more rows
# Mean sale price and number of sales per house style, ordered from the rarest
# style to the most common one. Styles with more than eight sales are tagged
# 'Normal Group'; the rest are tagged 'Low Probability Group'.
house_style_group <- ames %>%
  group_by(House.Style) %>%
  summarise(
    Avg_SalePrice = mean(SalePrice, na.rm = TRUE),
    Count = n()
  ) %>%
  arrange(Count) %>%
  mutate(
    Probability_Tag = ifelse(
      Count > 8,
      'Normal Group',
      'Low Probability Group'
    )
  )
house_style_group
## # A tibble: 8 × 4
## House.Style Avg_SalePrice Count Probability_Tag
## <chr> <dbl> <int> <chr>
## 1 2.5Fin 220000 8 Low Probability Group
## 2 1.5Unf 109663. 19 Normal Group
## 3 2.5Unf 177158. 24 Normal Group
## 4 SFoyer 143473. 83 Normal Group
## 5 SLvl 165527. 128 Normal Group
## 6 1.5Fin 137530. 314 Normal Group
## 7 2Story 206990. 873 Normal Group
## 8 1Story 178700. 1481 Normal Group
# Bar chart of the mean sale price for each zoning class; bars are coloured by
# the probability tag. geom_col() draws bar heights straight from the data.
ggplot(
  zoning_group,
  aes(x = MS.Zoning, y = Avg_SalePrice, fill = Probability_Tag)
) +
  geom_col() +
  labs(
    title = 'Average Sale Price by MS.Zoning',
    x = 'Zoning Classification',
    y = 'Average Sale Price'
  ) +
  theme_minimal()
# Horizontal bar chart of the mean sale price per neighborhood, coloured by
# the probability tag. reorder() orders the neighborhood levels by descending
# average price, and coord_flip() turns the bars sideways.
ggplot(
  neighborhood_group,
  aes(
    x = reorder(Neighborhood, -Avg_SalePrice),
    y = Avg_SalePrice,
    fill = Probability_Tag
  )
) +
  geom_col() +
  coord_flip() +
  labs(
    title = 'Average Sale Price by Neighborhood',
    x = 'Neighborhood',
    y = 'Average Sale Price'
  ) +
  theme_minimal()
# Bar chart of the mean sale price for each house style; bars are coloured by
# the probability tag. geom_col() draws bar heights straight from the data.
ggplot(
  house_style_group,
  aes(x = House.Style, y = Avg_SalePrice, fill = Probability_Tag)
) +
  geom_col() +
  labs(
    title = 'Average Sale Price by House Style',
    x = 'House Style',
    y = 'Average Sale Price'
  ) +
  theme_minimal()
# Cross-tabulate sales by neighborhood (one row each) and zoning class (one
# column each). count() returns an ungrouped tally, so summarise()'s "grouped
# output" message is no longer raised, and pivot_wider() replaces the
# superseded spread(). Combinations that never occur are filled with 0.
combo_group <- ames %>%
  count(Neighborhood, MS.Zoning, name = 'Count') %>%
  pivot_wider(
    names_from = MS.Zoning,
    values_from = Count,
    values_fill = 0,
    names_sort = TRUE  # keep the zoning columns in sorted order
  )
## `summarise()` has grouped output by 'MS.Zoning'. You can override using the
## `.groups` argument.
# List every zoning/neighborhood pair that has no recorded sale.
# The previous filter, if_all(everything(), ~ . == 0), also compared the
# character Neighborhood column with 0 and required every zoning count in a
# row to be zero, so it could never match a row. Reshaping to one row per pair
# lets each zero cell be reported individually.
# NOTE(review): the rendered output that follows this code was produced by the
# old filter; re-knit the document to refresh it.
missing_combinations <- combo_group %>%
  pivot_longer(
    cols = -Neighborhood,
    names_to = 'MS.Zoning',
    values_to = 'Count'
  ) %>%
  filter(Count == 0) %>%
  select(MS.Zoning, Neighborhood)
missing_combinations
## # A tibble: 0 × 8
## # ℹ 8 variables: Neighborhood <chr>, A (agr) <dbl>, C (all) <dbl>, FV <dbl>,
## # I (all) <dbl>, RH <dbl>, RL <dbl>, RM <dbl>
# Number of sales for each zoning/neighborhood pair, most frequent first.
# count(sort = TRUE) returns an ungrouped tibble, so the result is no longer
# left grouped by MS.Zoning and summarise()'s "grouped output" message is
# not raised.
most_common_combinations <- ames %>%
  count(MS.Zoning, Neighborhood, name = 'Count', sort = TRUE)
## `summarise()` has grouped output by 'MS.Zoning'. You can override using the
## `.groups` argument.
# Horizontal dodged bar chart of the sale count per neighborhood, with one bar
# per zoning class. coord_flip() turns the bars sideways.
ggplot(
  most_common_combinations,
  aes(x = Neighborhood, y = Count, fill = MS.Zoning)
) +
  geom_col(position = 'dodge') +
  coord_flip() +
  labs(
    title = 'Most Common Combinations of MS.Zoning and Neighborhood',
    x = 'Neighborhood',
    y = 'Count'
  ) +
  theme_minimal()