```{r}
# Install the required packages only when they are missing, so that
# re-running or knitting this chunk does not reinstall them every time.
if (!requireNamespace("modeldata", quietly = TRUE)) {
  install.packages("modeldata")
}
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}

library(modeldata) # Load dataset package
library(dplyr)     # Load dplyr for piping and data manipulation

data("ames") # Load the Ames housing dataset

# NOTE(review): the printed results later in this document report 2,053 rows,
# which suggests they came from a training split rather than the full data
# set -- confirm whether a split is intended here.
ames_train <- ames # Assign it to a variable
```


```{r}
# Frequency of each Neighborhood level, rarest levels first
count(ames_train, Neighborhood) %>%
  arrange(n)
## # A tibble: 28 x 2
##    Neighborhood                                n
##    <fct>                                   <int>
##  1 Landmark                                    1
##  2 Green_Hills                                 2
##  3 Greens                                      7
##  4 Blueste                                     9
##  5 Northpark_Villa                            17
##  6 Briardale                                  18
##  7 Veenker                                    20
##  8 Bloomington_Heights                        21
##  9 South_and_West_of_Iowa_State_University    30
## 10 Meadow_Village                             30
## # … with 18 more rows
```

```{r}
# Frequency of each distinct Screen_Porch value, rarest values first
count(ames_train, Screen_Porch) %>%
  arrange(n)
## # A tibble: 93 x 2
##    Screen_Porch     n
##           <int> <int>
##  1           40     1
##  2           80     1
##  3           92     1
##  4           94     1
##  5           99     1
##  6          104     1
##  7          109     1
##  8          110     1
##  9          111     1
## 10          117     1
## # … with 83 more rows
```

```{r}
# Install the package if not already installed
if (!requireNamespace("recipes", quietly = TRUE)) {
  install.packages("recipes")
}
library(recipes) # Load the package
```

```{r}
# Lump levels for two features
lumping <- recipe(Sale_Price ~ ., data = ames_train) %>%
  # Screen_Porch is an integer column, so turn it into a factor before
  # lumping its values.
  # NOTE(review): recent recipes releases appear to reject non-nominal input
  # to step_other() -- confirm against the installed version.
  step_mutate(Screen_Porch = factor(Screen_Porch)) %>%
  step_other(Neighborhood, threshold = 0.01, other = "other") %>%
  step_other(Screen_Porch, threshold = 0.1, other = ">0")

# Apply this blueprint --> you will learn about this at
# the end of the chapter
apply_2_training <- prep(lumping, training = ames_train) %>%
  bake(ames_train)

# New distribution of Neighborhood
count(apply_2_training, Neighborhood) %>%
  arrange(n)
## # A tibble: 22 x 2
##    Neighborhood                                n
##    <fct>                                   <int>
##  1 Bloomington_Heights                        21
##  2 South_and_West_of_Iowa_State_University    30
##  3 Meadow_Village                             30
##  4 Clear_Creek                                31
##  5 Stone_Brook                                34
##  6 Northridge                                 48
##  7 Timberland                                 55
##  8 Iowa_DOT_and_Rail_Road                     62
##  9 Crawford                                   72
## 10 Mitchell                                   74
## # … with 12 more rows

# New distribution of Screen_Porch
count(apply_2_training, Screen_Porch) %>%
  arrange(n)
## # A tibble: 2 x 2
##   Screen_Porch     n
##   <fct>        <int>
## 1 >0             174
## 2 0             1879
```


```{r}
# One-hot encode every nominal feature: one_hot = TRUE is passed to
# step_dummy() for all columns selected by all_nominal().
recipe(Sale_Price ~ ., data = ames_train) %>%
  step_dummy(all_nominal(), one_hot = TRUE)
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor         80
## 
## Operations:
## 
## Dummy variables from all_nominal
```

```{r}
# Original categories
count(ames_train, MS_SubClass)
## # A tibble: 16 x 2
##    MS_SubClass                                   n
##    <fct>                                     <int>
##  1 One_Story_1946_and_Newer_All_Styles         749
##  2 One_Story_1945_and_Older                     93
##  3 One_Story_with_Finished_Attic_All_Ages        5
##  4 One_and_Half_Story_Unfinished_All_Ages       11
##  5 One_and_Half_Story_Finished_All_Ages        207
##  6 Two_Story_1946_and_Newer                    394
##  7 Two_Story_1945_and_Older                     98
##  8 Two_and_Half_Story_All_Ages                  17
##  9 Split_or_Multilevel                          78
## 10 Split_Foyer                                  31
## 11 Duplex_All_Styles_and_Ages                   69
## 12 One_Story_PUD_1946_and_Newer                144
## 13 One_and_Half_Story_PUD_All_Ages               1
## 14 Two_Story_PUD_1946_and_Newer                 98
## 15 PUD_Multilevel_Split_Level_Foyer             14
## 16 Two_Family_conversion_All_Styles_and_Ages    44

# Label encoded: step_integer() replaces each MS_SubClass level with an
# integer code (1-16 in the output below).
recipe(Sale_Price ~ ., data = ames_train) %>%
  step_integer(MS_SubClass) %>%
  prep(ames_train) %>%
  bake(ames_train) %>%
  count(MS_SubClass)
## # A tibble: 16 x 2
##    MS_SubClass     n
##
##  1           1   749
##  2           2    93
##  3           3     5
##  4           4    11
##  5           5   207
##  6           6   394
##  7           7    98
##  8           8    17
##  9           9    78
## 10          10    31
## 11          11    69
## 12          12   144
## 13          13     1
## 14          14    98
## 15          15    14
## 16          16    44
```


```{r}
# Show every column whose name contains "Qual"
ames_train %>%
  select(contains("Qual"))
## # A tibble: 2,053 x 6
##    Overall_Qual Exter_Qual Bsmt_Qual Low_Qual_Fin_SF Kitchen_Qual
##    <fct>        <fct>      <fct>               <int> <fct>       
##  1 Above_Avera… Typical    Typical                 0 Typical     
##  2 Average      Typical    Typical                 0 Typical     
##  3 Above_Avera… Typical    Typical                 0 Good        
##  4 Above_Avera… Typical    Typical                 0 Good        
##  5 Very_Good    Good       Good                    0 Good        
##  6 Very_Good    Good       Good                    0 Good        
##  7 Good         Typical    Typical                 0 Good        
##  8 Above_Avera… Typical    Good                    0 Typical     
##  9 Above_Avera… Typical    Good                    0 Typical     
## 10 Good         Typical    Good                    0 Good        
## # … with 2,043 more rows, and 1 more variable: Garage_Qual <fct>
```

```{r}
# Original categories
count(ames_train, Overall_Qual)
## # A tibble: 10 x 2
##    Overall_Qual       n
##    <fct>          <int>
##  1 Very_Poor          4
##  2 Poor               9
##  3 Fair              27
##  4 Below_Average    166
##  5 Average          565
##  6 Above_Average    513
##  7 Good             438
##  8 Very_Good        231
##  9 Excellent         77
## 10 Very_Excellent    23

# Label encoded: step_integer() replaces each Overall_Qual level with an
# integer code (1-10 in the output below).
recipe(Sale_Price ~ ., data = ames_train) %>%
  step_integer(Overall_Qual) %>%
  prep(ames_train) %>%
  bake(ames_train) %>%
  count(Overall_Qual)
## # A tibble: 10 x 2
##    Overall_Qual     n
##
##  1            1     4
##  2            2     9
##  3            3    27
##  4            4   166
##  5            5   565
##  6            6   513
##  7            7   438
##  8            8   231
##  9            9    77
## 10           10    23
```