Homework 5 - dplyr

This homework explores the 5 dplyr verbs as well as some ggplot skills.

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

When in doubt: cheatsheet

Part 1 - Babynames

We’re going to use the built-in data babynames. You can use ? babynames for more information on what the variables are (…aka, get the metadata!)

#install.packages("babynames") #once per machine
library(babynames) # every time

## Warning: package 'babynames' was built under R version 4.3.3

#write.csv(babynames, "babynames.csv", row.names = FALSE)

# Compact overview of babynames: one line per column with its type and first values
babynames %>% glimpse()

## Rows: 1,924,665
## Columns: 5
## $ year <dbl> 1880, 1880, 1880, 1880, 1880, 1880, 1880, 1880, 1880, 1880, 1880,…
## $ sex  <chr> "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", …
## $ name <chr> "Mary", "Anna", "Emma", "Elizabeth", "Minnie", "Margaret", "Ida",…
## $ n    <int> 7065, 2604, 2003, 1939, 1746, 1578, 1472, 1414, 1320, 1288, 1258,…
## $ prop <dbl> 0.07238359, 0.02667896, 0.02052149, 0.01986579, 0.01788843, 0.016…

Use select() to choose 3 columns, year, name and n.
Use filter() to subset to only one name of your choice.
Do both of those things (pipe them!) and assign the result to a new object.
Check the dimensions of that object, view the first few rows, and verify what kind of object it is.
Make a plot of the popularity of your chosen name over time (you’re going to want a line graph where the year is the x axis and the popularity is the y axis)
Use mutate() to create a new variable (column) popular that is true if the name was given to more than 1% of babies in that year.
Use rename() to rename the variable popular.
Filter your data to only the popular names
Arrange the data from most to least popular to find the most popular name of all time. (Going the other way doesn’t work, the data is limited to names used 5 times or more)
Choose a name, and find the year in which that name was used most frequently.
Choose a year, and find the name that was most common that year.
Go back to your filtered data from above, and use summarize() to find the most times that name was given to a child in any given year (find the maximum value of n).

# Keep only the year, name and n columns
select(babynames, year, name, n)

## # A tibble: 1,924,665 × 3
##     year name          n
##    <dbl> <chr>     <int>
##  1  1880 Mary       7065
##  2  1880 Anna       2604
##  3  1880 Emma       2003
##  4  1880 Elizabeth  1939
##  5  1880 Minnie     1746
##  6  1880 Margaret   1578
##  7  1880 Ida        1472
##  8  1880 Alice      1414
##  9  1880 Bertha     1320
## 10  1880 Sarah      1288
## # ℹ 1,924,655 more rows

# Keep only the rows whose name is exactly "Tony"
filter(babynames, name == "Tony")

## # A tibble: 240 × 5
##     year sex   name      n      prop
##    <dbl> <chr> <chr> <int>     <dbl>
##  1  1880 M     Tony     42 0.000355 
##  2  1881 M     Tony     36 0.000332 
##  3  1882 M     Tony     46 0.000377 
##  4  1883 M     Tony     32 0.000284 
##  5  1884 M     Tony     41 0.000334 
##  6  1885 M     Tony     33 0.000285 
##  7  1886 M     Tony     29 0.000244 
##  8  1887 M     Tony     40 0.000366 
##  9  1888 M     Tony     57 0.000439 
## 10  1889 F     Tony      6 0.0000317
## # ℹ 230 more rows

# Reduce babynames to year/name/n, then keep the "Tony" rows and store the result
tony <- filter(
  select(babynames, year, name, n),
  name == "Tony"
)
tony

## # A tibble: 240 × 3
##     year name      n
##    <dbl> <chr> <int>
##  1  1880 Tony     42
##  2  1881 Tony     36
##  3  1882 Tony     46
##  4  1883 Tony     32
##  5  1884 Tony     41
##  6  1885 Tony     33
##  7  1886 Tony     29
##  8  1887 Tony     40
##  9  1888 Tony     57
## 10  1889 Tony      6
## # ℹ 230 more rows

# Dimensions of the filtered table: number of rows, then number of columns
dim(tony)

## [1] 240   3

# Show the first three rows
head(tony, 3)

## # A tibble: 3 × 3
##    year name      n
##   <dbl> <chr> <int>
## 1  1880 Tony     42
## 2  1881 Tony     36
## 3  1882 Tony     46

# typeof() reports the underlying storage type of the object.
# NOTE(review): class(tony) would additionally show its data-frame/tibble classes,
# which answers "what kind of object is it" more directly.
typeof(tony)

## [1] "list"

# plot
# Popularity of "Annie" over time.
# babynames has one row per year/sex/name, so a year can appear twice for the
# same name. summarise() collapses those rows into a single total per year
# (mutate() kept every row, drawing each such year twice) and
# .groups = "drop" returns an ungrouped table.
annie <- babynames %>%
  filter(name == "Annie") %>%
  group_by(year, name) %>%
  summarise(total = sum(prop), .groups = "drop")

# Line graph with year on the x axis and popularity on the y axis,
# plus a smoothed trend with its confidence band.
ggplot(data = annie, aes(x = year, y = total)) +
  geom_line() +
  geom_smooth(method = "auto", se = TRUE)

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Flag rows whose prop is more than 1% (strictly greater than 0.01).
# The comparison is already vectorised, so no nested ifelse() is needed;
# `!is.na(prop) &` makes a missing prop count as "not popular".
# (is.null() on a column is always FALSE, so that check was dropped.)
baby_addpopular <- babynames %>%
  mutate(popular = !is.na(prop) & prop > 0.01)

#baby_addpopular %>% group_by(year, popular) %>% summarise(count = n())

# Rename the flag column from popular to isPopular
baby_rename <- baby_addpopular %>%
  rename(isPopular = popular)

# Keep only the popular rows (isPopular is already logical)
popular <- baby_rename %>%
  filter(isPopular)

# Most popular first: sort by prop in descending order
popular %>%
  arrange(desc(prop))

## # A tibble: 3,878 × 6
##     year sex   name        n   prop isPopular
##    <dbl> <chr> <chr>   <int>  <dbl> <lgl>    
##  1  1880 M     John     9655 0.0815 TRUE     
##  2  1881 M     John     8769 0.0810 TRUE     
##  3  1880 M     William  9532 0.0805 TRUE     
##  4  1883 M     John     8894 0.0791 TRUE     
##  5  1881 M     William  8524 0.0787 TRUE     
##  6  1882 M     John     9557 0.0783 TRUE     
##  7  1884 M     John     9388 0.0765 TRUE     
##  8  1882 M     William  9298 0.0762 TRUE     
##  9  1886 M     John     9026 0.0758 TRUE     
## 10  1885 M     John     8756 0.0755 TRUE     
## # ℹ 3,868 more rows

#' Find the year in which a name was most popular
#'
#' Filters `babynames` to rows whose `name` equals `input_name` (exact match)
#' and returns the year(s) of the row(s) with the largest `prop`.
#'
#' @param input_name A single name to look up.
#' @return A one-column tibble (`year`), or `NULL` after a message when the
#'   name has no rows in `babynames`.
find_most_frequent_year <- function(input_name) {
  # A longer vector would be silently recycled by `==` in filter()
  stopifnot(length(input_name) == 1)

  filter_by_name <- babynames %>%
    filter(name == input_name)

  if (nrow(filter_by_name) == 0) {
    # message() concatenates its arguments itself; the previous call passed
    # sep = " " to message(), which appended a stray trailing space.
    message("The name ", input_name, " does not exist in dataset")
    return(NULL)
  }

  # distinct() collapses ties that fall in the same year
  filter_by_name %>%
    filter(prop == max(prop)) %>%
    distinct(year)
}

# Name with no rows in babynames: expect a message and a NULL result
find_most_frequent_year("aa")

## The name aa does not exist in dataset

## NULL

# Name present in babynames: returns a one-column tibble with the year
find_most_frequent_year("John")

## # A tibble: 1 × 1
##    year
##   <dbl>
## 1  1880

#' Find the most common name in a given year
#'
#' Filters `babynames` to rows whose `year` equals `input_year` and returns
#' the name(s) of the row(s) with the largest `prop`.
#'
#' @param input_year A single year to look up.
#' @return A one-column tibble (`name`), or `NULL` after a message when the
#'   year has no rows in `babynames`.
find_most_common_name <- function(input_year) {
  # A longer vector would be silently recycled by `==` in filter()
  stopifnot(length(input_year) == 1)

  filter_by_year <- babynames %>%
    filter(year == input_year)

  if (nrow(filter_by_year) == 0) {
    # message() concatenates its arguments itself; the previous call passed
    # sep = " " to message(), which appended a stray trailing space.
    message("The year ", input_year, " does not exist in dataset")
    return(NULL)
  }

  # distinct() collapses ties that share the same name
  filter_by_year %>%
    filter(prop == max(prop)) %>%
    distinct(name)
}

# Year present in babynames: returns a one-column tibble with the name
find_most_common_name(1980)

## # A tibble: 1 × 1
##   name   
##   <chr>  
## 1 Michael

# Year with no rows in babynames: expect a message and a NULL result
find_most_common_name(19801)

## The year 19801 does not exist in dataset

## NULL

# Largest value of n found on any "Michael" row
most_time_given <- summarise(
  filter(babynames, name == "Michael"),
  most_time_given = max(n)
)
most_time_given

## # A tibble: 1 × 1
##   most_time_given
##             <int>
## 1           92695

To talk about the difference between the column name n and the function n() (that only works inside the summarize function), consider the below summarize, by adding in your name object. What does the value of num_rows represent (in real-world terms) in the previous result? Explain it to your future manager.
Answer: The column name ‘n’ in the data frame refers to a variable (column) that contains numeric values; what those values mean depends on the dataset. The function ‘n()’, on the other hand, comes from the dplyr package and calculates the number of rows in a group. If the data has been grouped with group_by(), it calculates the number of occurrences per group. ‘num_rows’ represents the total number of observations (rows) for the specific name in the dataset.

# n() counts rows; with no grouping this is the row count of the whole table
summarize(babynames, num_rows = n())

## # A tibble: 1 × 1
##   num_rows
##      <int>
## 1  1924665

Part 2 - Houses

Load the data in using the following command:

library(dplyr)
# Read the house data straight from the course repository (needs network access)
original_data <- read.csv("https://raw.githubusercontent.com/kshoemaker/Class_test/master/housedata.csv")
# renames a few variables for clarity
# (GrLivArea -> TotalSqFeet, ScreenPorch -> PorchSqFeet; everything() keeps the
# remaining columns after these two)
original_data <- original_data %>% select(TotalSqFeet = GrLivArea, PorchSqFeet = ScreenPorch, everything()) 

# Age of the oldest house: current calendar year minus the earliest YearBuilt
oldest_house_age <- summarise(
  original_data,
  oldest_house_age = as.numeric(format(Sys.Date(), "%Y")) - min(YearBuilt)
)
oldest_house_age

##   oldest_house_age
## 1              152

How old is the oldest house?
How big is the biggest lot?
How much did the oldest house sell for?

library(dplyr)
library(ggplot2)

# How old is the oldest house? Current calendar year minus the earliest YearBuilt
oldest_house_age <- summarise(
  original_data,
  oldest_house_age = as.numeric(format(Sys.Date(), "%Y")) - min(YearBuilt)
)
oldest_house_age

##   oldest_house_age
## 1              152

# How big is the biggest lot? Largest LotArea in the data, as a double
biggest_lot <- summarise(
  original_data,
  biggest_lot = as.numeric(max(LotArea))
)
biggest_lot

##   biggest_lot
## 1      215245

# How much did the oldest house sell for?
# Sort ascending by YearBuilt, keep the first row, and extract its SalePrice.
oldest_house_price <- original_data %>%
  arrange(YearBuilt) %>%
  head(1) %>%
  pull(SalePrice)
oldest_house_price

## [1] 122000

What is the average total square footage and minimum lot area for each building type?
How many of the houses have central air?
Make a scatterplot with TotalSqFeet on the x-axis and SalePrice on the y-axis, color the points by BldgType.
Make a ggplot histogram of the porch square footage of all the houses.
Create a new variable has_porch in your dataframe that indicates whether or not the house has a screened porch, something like T/F or “yes”/“no”. How many houses have screened porches? How many don’t?
Redo your histogram of the porch square footage, but now filter the data to only houses with screened porches.

# Per building type: mean TotalSqFeet and smallest LotArea
library(ggplot2)
avg_sqf <- summarise(
  group_by(original_data, BldgType),
  avg_sqf = mean(TotalSqFeet),
  minimum_lot = min(LotArea)
)
avg_sqf

## # A tibble: 5 × 3
##   BldgType avg_sqf minimum_lot
##   <chr>      <dbl>       <int>
## 1 1Fam       1539.        2500
## 2 2fmCon     1561.        4456
## 3 Duplex     1567.        6040
## 4 Twnhs      1276.        1526
## 5 TwnhsE     1312.        1300

# Number of houses whose CentralAir value is "Y"
num_central_air <- nrow(filter(original_data, CentralAir == "Y"))
num_central_air

## [1] 1365

# Sale price against living area, one colour per building type;
# the y axis uses comma-separated labels.
ggplot(
  data = original_data,
  mapping = aes(x = TotalSqFeet, y = SalePrice, color = BldgType)
) +
  geom_point(size = 1) +
  scale_y_continuous(labels = scales::comma)

# Bonus: the same scatterplot split into one panel per building type,
# with each panel free to choose its own axis ranges.
ggplot(
  data = original_data,
  mapping = aes(x = TotalSqFeet, y = SalePrice, color = BldgType)
) +
  geom_point(size = 1) +
  scale_y_continuous(labels = scales::comma) +
  facet_wrap(~ BldgType, scales = "free")

# Histogram of PorchSqFeet for every house, in bins 10 units wide

ggplot(data = original_data, mapping = aes(x = PorchSqFeet)) +
  geom_histogram(binwidth = 10, fill = "skyblue", color = "black") +
  labs(x = "Porch SQF", y = "Frequency")

# Flag houses that have a screened porch.
# PorchSqFeet is the ScreenPorch column as renamed when the data was loaded;
# the previous version tested OpenPorchSF, which is a different column, so the
# flag did not match the PorchSqFeet values plotted in the histograms.
# The "T"/"F" strings are kept because later filters compare has_porch to them.
# NOTE: counts printed in previously rendered output came from the
# OpenPorchSF version and need to be regenerated.
data1 <- original_data %>%
  mutate(has_porch = ifelse(PorchSqFeet > 0, "T", "F"))

# Number of houses with a screened porch
num_hasScreenPorch <- data1 %>%
  filter(has_porch == "T") %>%
  nrow()
num_hasScreenPorch

## [1] 804

# Number of houses flagged "F" in has_porch
num_noScreenPorch <- nrow(filter(data1, has_porch == "F"))
num_noScreenPorch

## [1] 656

# Histogram of PorchSqFeet restricted to houses flagged "T" in has_porch
ggplot(
  data = filter(data1, has_porch == "T"),
  mapping = aes(x = PorchSqFeet)
) +
  geom_histogram(binwidth = 10, fill = "skyblue", color = "red") +
  labs(x = "Porch SQF--", y = "Frequency")

Homework 5 - dplyr

Tan Nguyen

DATA 2401

Part 1 - Babynames

Part 2 - Houses