This homework explores the 5 dplyr verbs as well as some ggplot skills.

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

When in doubt: cheatsheet

Part 1 - Babynames

We’re going to use the babynames data set, which ships with the babynames package. You can use ?babynames for more information on what the variables are (i.e., get the metadata!)

#install.packages("babynames") #once per machine
library(babynames) # every time
## Warning: package 'babynames' was built under R version 4.3.3
#write.csv(babynames, "babynames.csv", row.names = FALSE)

# Structural overview of the data: row/column counts, column types and the
# first few values of each column.
babynames %>% glimpse()
## Rows: 1,924,665
## Columns: 5
## $ year <dbl> 1880, 1880, 1880, 1880, 1880, 1880, 1880, 1880, 1880, 1880, 1880,…
## $ sex  <chr> "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", …
## $ name <chr> "Mary", "Anna", "Emma", "Elizabeth", "Minnie", "Margaret", "Ida",…
## $ n    <int> 7065, 2604, 2003, 1939, 1746, 1578, 1472, 1414, 1320, 1288, 1258,…
## $ prop <dbl> 0.07238359, 0.02667896, 0.02052149, 0.01986579, 0.01788843, 0.016…
# Keep only the year, name and count columns.
select(babynames, year, name, n)
## # A tibble: 1,924,665 × 3
##     year name          n
##    <dbl> <chr>     <int>
##  1  1880 Mary       7065
##  2  1880 Anna       2604
##  3  1880 Emma       2003
##  4  1880 Elizabeth  1939
##  5  1880 Minnie     1746
##  6  1880 Margaret   1578
##  7  1880 Ida        1472
##  8  1880 Alice      1414
##  9  1880 Bertha     1320
## 10  1880 Sarah      1288
## # ℹ 1,924,655 more rows
# Every row recorded for the name "Tony".
filter(babynames, name == "Tony")
## # A tibble: 240 × 5
##     year sex   name      n      prop
##    <dbl> <chr> <chr> <int>     <dbl>
##  1  1880 M     Tony     42 0.000355 
##  2  1881 M     Tony     36 0.000332 
##  3  1882 M     Tony     46 0.000377 
##  4  1883 M     Tony     32 0.000284 
##  5  1884 M     Tony     41 0.000334 
##  6  1885 M     Tony     33 0.000285 
##  7  1886 M     Tony     29 0.000244 
##  8  1887 M     Tony     40 0.000366 
##  9  1888 M     Tony     57 0.000439 
## 10  1889 F     Tony      6 0.0000317
## # ℹ 230 more rows
# Rows for "Tony", reduced to the year, name and count columns.
tony <- babynames %>%
  filter(name == "Tony") %>%
  select(year, name, n)
tony
## # A tibble: 240 × 3
##     year name      n
##    <dbl> <chr> <int>
##  1  1880 Tony     42
##  2  1881 Tony     36
##  3  1882 Tony     46
##  4  1883 Tony     32
##  5  1884 Tony     41
##  6  1885 Tony     33
##  7  1886 Tony     29
##  8  1887 Tony     40
##  9  1888 Tony     57
## 10  1889 Tony      6
## # ℹ 230 more rows
# Number of rows, then number of columns, in `tony`.
c(nrow(tony), ncol(tony))
## [1] 240   3
# First three rows of `tony`.
head(tony, n = 3)
## # A tibble: 3 × 3
##    year name      n
##   <dbl> <chr> <int>
## 1  1880 Tony     42
## 2  1881 Tony     36
## 3  1882 Tony     46
# Underlying storage type of the tibble (reported as "list").
tony %>% typeof()
## [1] "list"
# Plot the yearly total of `prop` for the name "Annie".
# summarise() collapses the rows of each year into a single total, so every
# year contributes exactly one point. mutate() would instead repeat the same
# yearly total on every row of the group, duplicating the points fed to
# geom_smooth(), and would leave the result grouped.
annie <- babynames %>%
  filter(name == "Annie") %>%
  group_by(year, name) %>%
  summarise(total = sum(prop), .groups = "drop")

ggplot(data = annie, aes(x = year, y = total)) +
  geom_point() +
  geom_smooth(method = "auto", se = TRUE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Flag a row as popular when its `prop` is at least 0.01.
# A missing `prop` yields FALSE because `FALSE & NA` is FALSE. No is.null()
# test is needed: a column is never NULL, and is.null() is not vectorised.
baby_addpopular <- babynames %>%
  mutate(popular = !is.na(prop) & prop >= 0.01)

#baby_addpopular %>% group_by(year, popular) %>% summarise(count = n())

# Rename the flag column to `isPopular`.
baby_rename <- rename(baby_addpopular, isPopular = popular)

# Keep only the popular rows; the flag is already logical and never NA.
popular <- baby_rename %>% filter(isPopular)

# Popular rows ordered from the highest `prop` downwards.
popular %>% arrange(desc(prop))
## # A tibble: 3,878 × 6
##     year sex   name        n   prop isPopular
##    <dbl> <chr> <chr>   <int>  <dbl> <lgl>    
##  1  1880 M     John     9655 0.0815 TRUE     
##  2  1881 M     John     8769 0.0810 TRUE     
##  3  1880 M     William  9532 0.0805 TRUE     
##  4  1883 M     John     8894 0.0791 TRUE     
##  5  1881 M     William  8524 0.0787 TRUE     
##  6  1882 M     John     9557 0.0783 TRUE     
##  7  1884 M     John     9388 0.0765 TRUE     
##  8  1882 M     William  9298 0.0762 TRUE     
##  9  1886 M     John     9026 0.0758 TRUE     
## 10  1885 M     John     8756 0.0755 TRUE     
## # ℹ 3,868 more rows
#' Find the year in which a name reached its highest `prop`.
#'
#' Looks the name up in the `babynames` data and keeps the row(s) whose
#' `prop` equals the maximum `prop` observed for that name.
#'
#' @param input_name Name to look up; compared to the `name` column with `==`.
#' @return A one-column tibble (`year`) holding the distinct year(s) of the
#'   maximum `prop`, or `NULL` (after emitting a message) when no row matches
#'   `input_name`.
find_most_frequent_year <- function(input_name) {
  name_rows <- babynames %>% filter(name == input_name)

  if (nrow(name_rows) == 0) {
    # message() concatenates its arguments itself; the former
    # paste(...)/sep = " " combination passed `sep` to message() and appended
    # a stray trailing space to the text.
    message("The name ", input_name, " does not exist in dataset")
    return(NULL)
  }

  name_rows %>%
    filter(prop == max(prop)) %>%
    distinct(year)
}

find_most_frequent_year("aa")
## The name aa does not exist in dataset
## NULL
# Example lookup for a name that is present in the data.
find_most_frequent_year(input_name = "John")
## # A tibble: 1 × 1
##    year
##   <dbl>
## 1  1880
#' Find the name with the highest `prop` in a given year.
#'
#' Restricts the `babynames` data to one year and keeps the row(s) whose
#' `prop` equals the maximum `prop` observed in that year.
#'
#' @param input_year Year to look up; compared to the `year` column with `==`.
#' @return A one-column tibble (`name`) holding the distinct name(s) with the
#'   maximum `prop`, or `NULL` (after emitting a message) when no row matches
#'   `input_year`.
find_most_common_name <- function(input_year) {
  year_rows <- babynames %>% filter(year == input_year)

  if (nrow(year_rows) == 0) {
    # message() concatenates its arguments itself; the former
    # paste(...)/sep = " " combination passed `sep` to message() and appended
    # a stray trailing space to the text.
    message("The year ", input_year, " does not exist in dataset")
    return(NULL)
  }

  year_rows %>%
    filter(prop == max(prop)) %>%
    distinct(name)
}

find_most_common_name(1980)
## # A tibble: 1 × 1
##   name   
##   <chr>  
## 1 Michael
# A year with no rows in the data takes the "does not exist" branch.
find_most_common_name(input_year = 19801)
## The year 19801 does not exist in dataset
## NULL
# Highest value of `n` found among the rows for "Michael".
most_time_given <- babynames %>%
  filter(name == "Michael") %>%
  summarise(most_time_given = max(n))
most_time_given
## # A tibble: 1 × 1
##   most_time_given
##             <int>
## 1           92695
# Total number of rows in the data set, returned as a one-cell tibble.
summarise(babynames, num_rows = n())
## # A tibble: 1 × 1
##   num_rows
##      <int>
## 1  1924665

Part 2 - Houses

Load the data in using the following command:

library(dplyr)
# Download the house data.
original_data <- read.csv(
  "https://raw.githubusercontent.com/kshoemaker/Class_test/master/housedata.csv"
)
# Rename two variables for clarity. select() places the renamed columns first
# and everything() keeps all remaining columns after them.
original_data <- original_data %>%
  select(
    TotalSqFeet = GrLivArea,
    PorchSqFeet = ScreenPorch,
    everything()
  )

# Age of the oldest house: current calendar year minus the earliest YearBuilt.
# The value therefore depends on the date the code is run.
oldest_house_age <- summarise(
  original_data,
  oldest_house_age = as.numeric(format(Sys.Date(), "%Y")) - min(YearBuilt)
)
oldest_house_age
##   oldest_house_age
## 1              152
library(dplyr)
library(ggplot2)

# Age of the oldest house (repeats the computation from the previous chunk):
# current calendar year minus the earliest YearBuilt.
oldest_house_age <- original_data %>%
  summarise(
    oldest_house_age = as.numeric(format(Sys.Date(), "%Y")) - min(YearBuilt)
  )
oldest_house_age
##   oldest_house_age
## 1              152
# Largest LotArea in the data, converted to a double.
biggest_lot <- summarise(
  original_data,
  biggest_lot = as.numeric(max(LotArea))
)
biggest_lot
##   biggest_lot
## 1      215245
# Sale price of the oldest house: sort ascending by build year, then take the
# first sale price of the sorted data.
oldest_house_price <- original_data %>%
  arrange(YearBuilt) %>%
  pull(SalePrice) %>%
  head(1)
oldest_house_price
## [1] 122000
# Average total square footage and minimum lot area for each building type.
library(ggplot2)
avg_sqf <- original_data %>%
  group_by(BldgType) %>%
  summarise(
    avg_sqf = mean(TotalSqFeet),
    minimum_lot = min(LotArea)
  )
avg_sqf
## # A tibble: 5 × 3
##   BldgType avg_sqf minimum_lot
##   <chr>      <dbl>       <int>
## 1 1Fam       1539.        2500
## 2 2fmCon     1561.        4456
## 3 Duplex     1567.        6040
## 4 Twnhs      1276.        1526
## 5 TwnhsE     1312.        1300
# Number of houses whose CentralAir value is "Y". na.rm = TRUE mirrors
# filter(), which drops rows where the comparison is NA.
num_central_air <- sum(original_data$CentralAir == "Y", na.rm = TRUE)
num_central_air
## [1] 1365
# Sale price against total square footage, coloured by building type.
# scales::comma prints the y-axis labels with thousands separators.
ggplot(original_data, aes(TotalSqFeet, SalePrice, color = BldgType)) +
  geom_point(size = 1) +
  scale_y_continuous(labels = scales::comma)

# Bonus: the same scatter plot drawn as one panel per building type, with
# each panel free to choose its own axis ranges.
ggplot(original_data, aes(TotalSqFeet, SalePrice, color = BldgType)) +
  geom_point(size = 1) +
  scale_y_continuous(labels = scales::comma) +
  facet_wrap(vars(BldgType), scales = "free")

# Histogram of porch square footage for all houses, using bins of width 10.

ggplot(original_data, aes(x = PorchSqFeet)) +
  geom_histogram(binwidth = 10, fill = "skyblue", color = "black") +
  labs(x = "Porch SQF", y = "Frequency")

# Flag houses that have a screen porch.
# PorchSqFeet is the renamed ScreenPorch column, which is what the
# screen-porch counts and the porch histogram are about. The flag was
# previously derived from OpenPorchSF (open porches), so the "has porch"
# subset still contained houses with no screen porch at all.
# The flag stays a "T"/"F" string because other chunks compare against it.
# NOTE(review): the counts printed in this document were produced with the
# old OpenPorchSF-based flag and must be regenerated by re-running the code.
data1 <- original_data %>%
  mutate(has_porch = ifelse(PorchSqFeet > 0, "T", "F"))
num_hasScreenPorch <- data1 %>%
  filter(has_porch == "T") %>%
  nrow()
num_hasScreenPorch
## [1] 804
# Number of houses flagged "F" in has_porch. na.rm = TRUE mirrors filter(),
# which drops rows where the comparison is NA.
num_noScreenPorch <- sum(data1$has_porch == "F", na.rm = TRUE)
num_noScreenPorch
## [1] 656
# Porch square-footage histogram restricted to houses flagged "T" in
# has_porch, using bins of width 10.
data1 %>%
  filter(has_porch == "T") %>%
  ggplot(aes(x = PorchSqFeet)) +
  geom_histogram(binwidth = 10, fill = "skyblue", color = "red") +
  labs(x = "Porch SQF--", y = "Frequency")