```{r}
# Install dplyr only when it is missing, so re-running this chunk does not
# reinstall the package every time.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}

library(dplyr) # Load dplyr for piping and data manipulation
```
```{r}
# Install modeldata and dplyr only when they are missing.
for (pkg in c("modeldata", "dplyr")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

library(modeldata) # Load dataset package
library(dplyr)     # Load dplyr for data manipulation

data("ames")       # Load the Ames housing dataset
# NOTE(review): the printed outputs later in this file report 2,053 rows, so
# they appear to come from a subset of `ames` rather than the full data set
# assigned here -- TODO confirm which split produced them.
ames_train <- ames # Assign it to a variable
```
```{r}
# Tally observations per Neighborhood, rarest levels first.
ames_train %>%
  count(Neighborhood) %>%
  arrange(n)
```
```{r}
# Tally observations per Neighborhood, rarest levels first.
# Lines starting with `##` are the printed result kept for reference.
count(ames_train, Neighborhood) %>%
  arrange(n)
## # A tibble: 28 x 2
##    Neighborhood                                n
##    <fct>                                   <int>
##  1 Landmark                                    1
##  2 Green_Hills                                 2
##  3 Greens                                      7
##  4 Blueste                                     9
##  5 Northpark_Villa                            17
##  6 Briardale                                  18
##  7 Veenker                                    20
##  8 Bloomington_Heights                        21
##  9 South_and_West_of_Iowa_State_University    30
## 10 Meadow_Village                             30
## # … with 18 more rows
```
```{r}
# Tally observations per distinct Screen_Porch value, rarest values first.
# Lines starting with `##` are the printed result kept for reference.
count(ames_train, Screen_Porch) %>%
  arrange(n)
## # A tibble: 93 x 2
##    Screen_Porch     n
##           <int> <int>
##  1           40     1
##  2           80     1
##  3           92     1
##  4           94     1
##  5           99     1
##  6          104     1
##  7          109     1
##  8          110     1
##  9          111     1
## 10          117     1
## # … with 83 more rows
```
```{r}
# Install recipes only when it is missing, so re-running this chunk does not
# reinstall the package every time.
if (!requireNamespace("recipes", quietly = TRUE)) {
  install.packages("recipes")
}

library(recipes) # Load the package
```
```{r}
# Lump levels for two features
# NOTE(review): Screen_Porch prints as <int> in the earlier count; some
# recipes releases may require a factor/character column for step_other() --
# confirm against the installed version.
lumping <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_other(Neighborhood, threshold = 0.01, other = "other") %>%
  step_other(Screen_Porch, threshold = 0.1, other = ">0")

# Estimate the recipe on the training data, then apply it to the same data.
apply_2_training <- prep(lumping, training = ames_train) %>%
  bake(ames_train)

# New distribution of Neighborhood
# NOTE(review): only the header of each printed tibble survives in this copy;
# re-run the chunk to regenerate the rows.
count(apply_2_training, Neighborhood) %>%
  arrange(n)
## # A tibble: 22 x 2
## Neighborhood n
##

# New distribution of Screen_Porch
count(apply_2_training, Screen_Porch) %>%
  arrange(n)
## # A tibble: 2 x 2
## Screen_Porch n
##
```
```{r}
# One-hot encode every nominal column selected by all_nominal().
# The recipe is only defined (and printed) here; it is not prepped or baked.
recipe(Sale_Price ~ ., data = ames_train) %>%
  step_dummy(all_nominal(), one_hot = TRUE)
## Data Recipe
##
## Inputs:
##
##       role #variables
##    outcome          1
##  predictor         80
##
## Operations:
##
## Dummy variables from all_nominal
```
```{r}
# Original categories
# NOTE(review): only the header of each printed tibble survives in this copy;
# re-run the chunk to regenerate the rows.
count(ames_train, MS_SubClass)
## # A tibble: 16 x 2
## MS_SubClass n
##

# Integer-encode MS_SubClass with step_integer(), then count the encoded values.
recipe(Sale_Price ~ ., data = ames_train) %>%
  step_integer(MS_SubClass) %>%
  prep(ames_train) %>%
  bake(ames_train) %>%
  count(MS_SubClass)
## # A tibble: 16 x 2
## MS_SubClass n
##
```
```{r}
# Show every column whose name contains "Qual".
# Lines starting with `##` are the printed result kept for reference.
ames_train %>%
  select(contains("Qual"))
## # A tibble: 2,053 x 6
##    Overall_Qual Exter_Qual Bsmt_Qual Low_Qual_Fin_SF Kitchen_Qual
##    <fct>        <fct>      <fct>               <int> <fct>
##  1 Above_Avera… Typical    Typical                 0 Typical
##  2 Average      Typical    Typical                 0 Typical
##  3 Above_Avera… Typical    Typical                 0 Good
##  4 Above_Avera… Typical    Typical                 0 Good
##  5 Very_Good    Good       Good                    0 Good
##  6 Very_Good    Good       Good                    0 Good
##  7 Good         Typical    Typical                 0 Good
##  8 Above_Avera… Typical    Good                    0 Typical
##  9 Above_Avera… Typical    Good                    0 Typical
## 10 Good         Typical    Good                    0 Good
## # … with 2,043 more rows, and 1 more variable: Garage_Qual <fct>
```
```{r}
# Original categories
# NOTE(review): only the header of each printed tibble survives in this copy;
# re-run the chunk to regenerate the rows.
count(ames_train, Overall_Qual)
## # A tibble: 10 x 2
## Overall_Qual n
##

# Integer-encode Overall_Qual with step_integer(), then count the encoded values.
recipe(Sale_Price ~ ., data = ames_train) %>%
  step_integer(Overall_Qual) %>%
  prep(ames_train) %>%
  bake(ames_train) %>%
  count(Overall_Qual)
## # A tibble: 10 x 2
## Overall_Qual n
##
```