```{r setup, include=FALSE}
# Global chunk defaults: echo code in the report, suppress messages and warnings.
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
```

Load necessary libraries

library(dplyr) 
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2) 
library(tidyr)

Load the dataset

# Read the Ames housing CSV; the first row of the file supplies the column names.
ames <- read.csv(
  file = 'D:/Stats for DS/ames.csv',
  header = TRUE
)

Display the first few rows of the data

# Show the first six rows of the data frame.
head(ames, n = 6L)
##   Order       PID MS.SubClass MS.Zoning Lot.Frontage Lot.Area Street Alley
## 1     1 526301100          20        RL          141    31770   Pave  <NA>
## 2     2 526350040          20        RH           80    11622   Pave  <NA>
## 3     3 526351010          20        RL           81    14267   Pave  <NA>
## 4     4 526353030          20        RL           93    11160   Pave  <NA>
## 5     5 527105010          60        RL           74    13830   Pave  <NA>
## 6     6 527105030          60        RL           78     9978   Pave  <NA>
##   Lot.Shape Land.Contour Utilities Lot.Config Land.Slope Neighborhood
## 1       IR1          Lvl    AllPub     Corner        Gtl        NAmes
## 2       Reg          Lvl    AllPub     Inside        Gtl        NAmes
## 3       IR1          Lvl    AllPub     Corner        Gtl        NAmes
## 4       Reg          Lvl    AllPub     Corner        Gtl        NAmes
## 5       IR1          Lvl    AllPub     Inside        Gtl      Gilbert
## 6       IR1          Lvl    AllPub     Inside        Gtl      Gilbert
##   Condition.1 Condition.2 Bldg.Type House.Style Overall.Qual Overall.Cond
## 1        Norm        Norm      1Fam      1Story            6            5
## 2       Feedr        Norm      1Fam      1Story            5            6
## 3        Norm        Norm      1Fam      1Story            6            6
## 4        Norm        Norm      1Fam      1Story            7            5
## 5        Norm        Norm      1Fam      2Story            5            5
## 6        Norm        Norm      1Fam      2Story            6            6
##   Year.Built Year.Remod.Add Roof.Style Roof.Matl Exterior.1st Exterior.2nd
## 1       1960           1960        Hip   CompShg      BrkFace      Plywood
## 2       1961           1961      Gable   CompShg      VinylSd      VinylSd
## 3       1958           1958        Hip   CompShg      Wd Sdng      Wd Sdng
## 4       1968           1968        Hip   CompShg      BrkFace      BrkFace
## 5       1997           1998      Gable   CompShg      VinylSd      VinylSd
## 6       1998           1998      Gable   CompShg      VinylSd      VinylSd
##   Mas.Vnr.Type Mas.Vnr.Area Exter.Qual Exter.Cond Foundation Bsmt.Qual
## 1        Stone          112         TA         TA     CBlock        TA
## 2         None            0         TA         TA     CBlock        TA
## 3      BrkFace          108         TA         TA     CBlock        TA
## 4         None            0         Gd         TA     CBlock        TA
## 5         None            0         TA         TA      PConc        Gd
## 6      BrkFace           20         TA         TA      PConc        TA
##   Bsmt.Cond Bsmt.Exposure BsmtFin.Type.1 BsmtFin.SF.1 BsmtFin.Type.2
## 1        Gd            Gd            BLQ          639            Unf
## 2        TA            No            Rec          468            LwQ
## 3        TA            No            ALQ          923            Unf
## 4        TA            No            ALQ         1065            Unf
## 5        TA            No            GLQ          791            Unf
## 6        TA            No            GLQ          602            Unf
##   BsmtFin.SF.2 Bsmt.Unf.SF Total.Bsmt.SF Heating Heating.QC Central.Air
## 1            0         441          1080    GasA         Fa           Y
## 2          144         270           882    GasA         TA           Y
## 3            0         406          1329    GasA         TA           Y
## 4            0        1045          2110    GasA         Ex           Y
## 5            0         137           928    GasA         Gd           Y
## 6            0         324           926    GasA         Ex           Y
##   Electrical X1st.Flr.SF X2nd.Flr.SF Low.Qual.Fin.SF Gr.Liv.Area Bsmt.Full.Bath
## 1      SBrkr        1656           0               0        1656              1
## 2      SBrkr         896           0               0         896              0
## 3      SBrkr        1329           0               0        1329              0
## 4      SBrkr        2110           0               0        2110              1
## 5      SBrkr         928         701               0        1629              0
## 6      SBrkr         926         678               0        1604              0
##   Bsmt.Half.Bath Full.Bath Half.Bath Bedroom.AbvGr Kitchen.AbvGr Kitchen.Qual
## 1              0         1         0             3             1           TA
## 2              0         1         0             2             1           TA
## 3              0         1         1             3             1           Gd
## 4              0         2         1             3             1           Ex
## 5              0         2         1             3             1           TA
## 6              0         2         1             3             1           Gd
##   TotRms.AbvGrd Functional Fireplaces Fireplace.Qu Garage.Type Garage.Yr.Blt
## 1             7        Typ          2           Gd      Attchd          1960
## 2             5        Typ          0         <NA>      Attchd          1961
## 3             6        Typ          0         <NA>      Attchd          1958
## 4             8        Typ          2           TA      Attchd          1968
## 5             6        Typ          1           TA      Attchd          1997
## 6             7        Typ          1           Gd      Attchd          1998
##   Garage.Finish Garage.Cars Garage.Area Garage.Qual Garage.Cond Paved.Drive
## 1           Fin           2         528          TA          TA           P
## 2           Unf           1         730          TA          TA           Y
## 3           Unf           1         312          TA          TA           Y
## 4           Fin           2         522          TA          TA           Y
## 5           Fin           2         482          TA          TA           Y
## 6           Fin           2         470          TA          TA           Y
##   Wood.Deck.SF Open.Porch.SF Enclosed.Porch X3Ssn.Porch Screen.Porch Pool.Area
## 1          210            62              0           0            0         0
## 2          140             0              0           0          120         0
## 3          393            36              0           0            0         0
## 4            0             0              0           0            0         0
## 5          212            34              0           0            0         0
## 6          360            36              0           0            0         0
##   Pool.QC Fence Misc.Feature Misc.Val Mo.Sold Yr.Sold Sale.Type Sale.Condition
## 1    <NA>  <NA>         <NA>        0       5    2010       WD          Normal
## 2    <NA> MnPrv         <NA>        0       6    2010       WD          Normal
## 3    <NA>  <NA>         Gar2    12500       6    2010       WD          Normal
## 4    <NA>  <NA>         <NA>        0       4    2010       WD          Normal
## 5    <NA> MnPrv         <NA>        0       3    2010       WD          Normal
## 6    <NA>  <NA>         <NA>        0       6    2010       WD          Normal
##   SalePrice
## 1    215000
## 2    105000
## 3    172000
## 4    244000
## 5    189900
## 6    195500

Grouping the data by three different categorical columns

1. Group the data by MS.Zoning

# Per zoning class: mean sale price and number of listings, smallest groups first.
zoning_group <- ames %>%
  group_by(MS.Zoning) %>%
  summarise(
    Avg_SalePrice = mean(SalePrice, na.rm = TRUE),
    Count = n()
  ) %>%
  arrange(Count)

Tagging the smallest groups with a special tag

# Label zoning classes with at most 2 listings as low-probability groups.
zoning_group <- mutate(
  zoning_group,
  Probability_Tag = ifelse(
    Count <= 2,
    'Low Probability Group',
    'Normal Group'
  )
)

Display the grouped data for MS.Zoning

# Print the per-zoning summary (average price, count, probability tag).
zoning_group
## # A tibble: 7 × 4
##   MS.Zoning Avg_SalePrice Count Probability_Tag      
##   <chr>             <dbl> <int> <chr>                
## 1 A (agr)          47300      2 Low Probability Group
## 2 I (all)          80312.     2 Low Probability Group
## 3 C (all)          79795.    25 Normal Group         
## 4 RH              136420.    27 Normal Group         
## 5 FV              218987.   139 Normal Group         
## 6 RM              126781.   462 Normal Group         
## 7 RL              191283.  2273 Normal Group

Insight:

a. The smallest groups, A (agr) and I (all) with only 2 entries each, have the lowest probability of being selected when a listing is drawn at random. These groups are tagged as low-probability groups.
b. Hypothesis: Some zoning types are less common due to zoning restrictions, geographical limitations, or neighborhood development patterns. This could be further tested by investigating zoning regulations in the Ames area.

2. Group the data by Neighborhood

# Per neighborhood: mean sale price and number of listings, smallest groups first.
neighborhood_group <- ames %>%
  group_by(Neighborhood) %>%
  summarise(
    Avg_SalePrice = mean(SalePrice, na.rm = TRUE),
    Count = n()
  ) %>%
  arrange(Count)

Tagging the smallest groups with a special tag

# Label neighborhoods with at most 2 listings as low-probability groups.
neighborhood_group <- mutate(
  neighborhood_group,
  Probability_Tag = ifelse(
    Count <= 2,
    'Low Probability Group',
    'Normal Group'
  )
)

Display the grouped data for Neighborhood

# Print the per-neighborhood summary (average price, count, probability tag).
neighborhood_group
## # A tibble: 28 × 4
##    Neighborhood Avg_SalePrice Count Probability_Tag      
##    <chr>                <dbl> <int> <chr>                
##  1 Landmrk            137000      1 Low Probability Group
##  2 GrnHill            280000      2 Low Probability Group
##  3 Greens             193531.     8 Normal Group         
##  4 Blueste            143590     10 Normal Group         
##  5 NPkVill            140711.    23 Normal Group         
##  6 Veenker            248315.    24 Normal Group         
##  7 Blmngtn            196662.    28 Normal Group         
##  8 BrDale             105608.    30 Normal Group         
##  9 MeadowV             95756.    37 Normal Group         
## 10 ClearCr            208662.    44 Normal Group         
## # ℹ 18 more rows

Insight:

a. Some neighborhoods (Landmrk, with only 1 entry, and GrnHill, with only 2 entries) are extremely rare and tagged as low-probability groups.
b. Hypothesis: The rarity of these neighborhoods could be due to their location being less desirable, or these neighborhoods might have strict building codes, leading to fewer houses being listed for sale.

3. Group the data by House.Style

# Per house style: mean sale price and number of listings, smallest groups first.
house_style_group <- ames %>%
  group_by(House.Style) %>%
  summarise(
    Avg_SalePrice = mean(SalePrice, na.rm = TRUE),
    Count = n()
  ) %>%
  arrange(Count)

Tagging the smallest groups with a special tag

# Label house styles with at most 8 listings as low-probability groups.
house_style_group <- mutate(
  house_style_group,
  Probability_Tag = ifelse(
    Count <= 8,
    'Low Probability Group',
    'Normal Group'
  )
)

Display the grouped data for House.Style

# Print the per-house-style summary (average price, count, probability tag).
house_style_group
## # A tibble: 8 × 4
##   House.Style Avg_SalePrice Count Probability_Tag      
##   <chr>               <dbl> <int> <chr>                
## 1 2.5Fin            220000      8 Low Probability Group
## 2 1.5Unf            109663.    19 Normal Group         
## 3 2.5Unf            177158.    24 Normal Group         
## 4 SFoyer            143473.    83 Normal Group         
## 5 SLvl              165527.   128 Normal Group         
## 6 1.5Fin            137530.   314 Normal Group         
## 7 2Story            206990.   873 Normal Group         
## 8 1Story            178700.  1481 Normal Group

Insight:

a. The 2.5Fin house style is the rarest in the dataset, with only 8 entries, and is the only style tagged as a low-probability group. Split Level (SLvl), with 128 entries, falls in the normal group.

Visualizing the groupings

1. Average Sale Price by MS.Zoning

# Bar chart of average sale price per zoning class, filled by probability tag.
ggplot(
  zoning_group,
  aes(x = MS.Zoning, y = Avg_SalePrice, fill = Probability_Tag)
) +
  geom_col() +
  labs(
    title = 'Average Sale Price by MS.Zoning',
    x = 'Zoning Classification',
    y = 'Average Sale Price'
  ) +
  theme_minimal()

2. Average Sale Price by Neighborhood

# Horizontal bar chart of average sale price per neighborhood; neighborhoods
# are ordered by descending average price before the axes are flipped.
ggplot(
  neighborhood_group,
  aes(
    x = reorder(Neighborhood, -Avg_SalePrice),
    y = Avg_SalePrice,
    fill = Probability_Tag
  )
) +
  geom_col() +
  coord_flip() +
  labs(
    title = 'Average Sale Price by Neighborhood',
    x = 'Neighborhood',
    y = 'Average Sale Price'
  ) +
  theme_minimal()

3. Average Sale Price by House Style

# Bar chart of average sale price per house style, filled by probability tag.
ggplot(
  house_style_group,
  aes(x = House.Style, y = Avg_SalePrice, fill = Probability_Tag)
) +
  geom_col() +
  labs(
    title = 'Average Sale Price by House Style',
    x = 'House Style',
    y = 'Average Sale Price'
  ) +
  theme_minimal()

Investigating Combinations of Categorical Variables

Group by MS.Zoning and Neighborhood, summarize the count

# Cross-tabulate listings: one row per Neighborhood, one column per zoning
# class, with 0 where a neighborhood has no listing in that zoning class.
combo_group <- ames %>%
  group_by(MS.Zoning, Neighborhood) %>%
  # Drop the grouping explicitly so the result is a plain tibble and dplyr
  # does not emit its "grouped output" message.
  summarise(Count = n(), .groups = 'drop') %>%
  # pivot_wider() is the maintained replacement for the superseded spread().
  pivot_wider(
    names_from = MS.Zoning,
    values_from = Count,
    values_fill = 0,
    names_sort = TRUE
  ) %>%
  # Keep the rows ordered by neighborhood name.
  arrange(Neighborhood)
## `summarise()` has grouped output by 'MS.Zoning'. You can override using the
## `.groups` argument.

Check for missing combinations (combinations with zero count)

# List every (Neighborhood, MS.Zoning) pair that never occurs in the data.
# Testing all columns of a row against 0 can never match, because the
# Neighborhood column is character; reshape to one row per pair instead and
# keep the pairs whose count is zero.
missing_combinations <- combo_group %>%
  pivot_longer(
    cols = -Neighborhood,
    names_to = 'MS.Zoning',
    values_to = 'Count'
  ) %>%
  filter(Count == 0) %>%
  select(Neighborhood, MS.Zoning)

Display missing combinations

# Print the result of the missing-combination check.
missing_combinations
## # A tibble: 0 × 8
## # ℹ 8 variables: Neighborhood <chr>, A (agr) <dbl>, C (all) <dbl>, FV <dbl>,
## #   I (all) <dbl>, RH <dbl>, RL <dbl>, RM <dbl>

Insight:

Missing combinations (if any) indicate that certain zoning types do not appear in specific neighborhoods. This could be due to zoning regulations preventing certain types of development in these areas.

Visualize the Most Common Combinations

Gather the data for visualization

# Count listings for each observed (MS.Zoning, Neighborhood) pair, most
# frequent pairs first.
most_common_combinations <- ames %>%
  group_by(MS.Zoning, Neighborhood) %>%
  # Drop the grouping explicitly: the result is then an ungrouped tibble and
  # dplyr no longer emits its "grouped output" message.
  summarise(Count = n(), .groups = 'drop') %>%
  arrange(desc(Count))
## `summarise()` has grouped output by 'MS.Zoning'. You can override using the
## `.groups` argument.

Visualize the most common combinations

# Horizontal dodged bar chart: listing count per neighborhood, one bar per
# zoning class present in that neighborhood.
ggplot(
  most_common_combinations,
  aes(x = Neighborhood, y = Count, fill = MS.Zoning)
) +
  geom_col(position = 'dodge') +
  coord_flip() +
  labs(
    title = 'Most Common Combinations of MS.Zoning and Neighborhood',
    x = 'Neighborhood',
    y = 'Count'
  ) +
  theme_minimal()

Conclusion:

This analysis provides a detailed look into the Ames Housing dataset. We identified low-probability groups across several categorical variables, hypothesized why some groups are smaller, and visualized the data accordingly. Additionally, we explored combinations of MS.Zoning and Neighborhood, uncovering some interesting patterns.