This homework explores the 5 dplyr verbs as well as some
ggplot skills.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
When in doubt: cheatsheet
We’re going to use the built-in data babynames. You can
use ? babynames for more information on what the variables
are (…aka, get the metadata!)
#install.packages("babynames") #once per machine
library(babynames) # every time
## Warning: package 'babynames' was built under R version 4.3.3
#write.csv(babynames, "babynames.csv", row.names = FALSE)
glimpse(babynames)
## Rows: 1,924,665
## Columns: 5
## $ year <dbl> 1880, 1880, 1880, 1880, 1880, 1880, 1880, 1880, 1880, 1880, 1880,…
## $ sex <chr> "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", …
## $ name <chr> "Mary", "Anna", "Emma", "Elizabeth", "Minnie", "Margaret", "Ida",…
## $ n <int> 7065, 2604, 2003, 1939, 1746, 1578, 1472, 1414, 1320, 1288, 1258,…
## $ prop <dbl> 0.07238359, 0.02667896, 0.02052149, 0.01986579, 0.01788843, 0.016…
Use select() to choose 3 columns, year, name and
n.
Use filter() to subset to only one name of your
choice.
Do both of those things (pipe them!) and assign the result to a new object.
Check the dimensions of that object, view the first few rows, and verify what kind of object it is.
Make a plot of the popularity of your chosen name over time (you’re going to want a line graph where the year is the x axis and the popularity is the y axis)
Use mutate() to create a new variable (column)
popular that is true if the name was given to more than 1%
of babies in that year.
Use rename() to rename the variable
popular.
Filter your data to only the popular names
Arrange the data from most to least popular to find the most popular name of all time. (Going the other way doesn’t work, the data is limited to names used 5 times or more)
Choose a name, and find the year in which that name was used most frequently.
Choose a year, and find the name that was most common that year.
Go back to your filtered data from above, and use
summarize() to find the most times that name was given to a
child in any given year (find the maximum value of n).
# Keep only the year, name and n columns (result is printed, not stored)
babynames %>%
  select(year, name, n)
## # A tibble: 1,924,665 × 3
## year name n
## <dbl> <chr> <int>
## 1 1880 Mary 7065
## 2 1880 Anna 2604
## 3 1880 Emma 2003
## 4 1880 Elizabeth 1939
## 5 1880 Minnie 1746
## 6 1880 Margaret 1578
## 7 1880 Ida 1472
## 8 1880 Alice 1414
## 9 1880 Bertha 1320
## 10 1880 Sarah 1288
## # ℹ 1,924,655 more rows
# Keep only the rows for one chosen name (result is printed, not stored)
babynames %>%
  filter(name == "Tony")
## # A tibble: 240 × 5
## year sex name n prop
## <dbl> <chr> <chr> <int> <dbl>
## 1 1880 M Tony 42 0.000355
## 2 1881 M Tony 36 0.000332
## 3 1882 M Tony 46 0.000377
## 4 1883 M Tony 32 0.000284
## 5 1884 M Tony 41 0.000334
## 6 1885 M Tony 33 0.000285
## 7 1886 M Tony 29 0.000244
## 8 1887 M Tony 40 0.000366
## 9 1888 M Tony 57 0.000439
## 10 1889 F Tony 6 0.0000317
## # ℹ 230 more rows
# Both steps piped together: pick the three columns, then keep the "Tony"
# rows, and store the result for the later steps.
tony <- babynames %>%
  select(year, name, n) %>%
  filter(name == "Tony")
tony
## # A tibble: 240 × 3
## year name n
## <dbl> <chr> <int>
## 1 1880 Tony 42
## 2 1881 Tony 36
## 3 1882 Tony 46
## 4 1883 Tony 32
## 5 1884 Tony 41
## 6 1885 Tony 33
## 7 1886 Tony 29
## 8 1887 Tony 40
## 9 1888 Tony 57
## 10 1889 Tony 6
## # ℹ 230 more rows
# Dimensions of the subset (rows, columns)
dim(tony)
## [1] 240 3
# First three rows of the subset
head(tony, n = 3)
## # A tibble: 3 × 3
## year name n
## <dbl> <chr> <int>
## 1 1880 Tony 42
## 2 1881 Tony 36
## 3 1882 Tony 46
# typeof() only reports the underlying storage mode, which is "list" for
# every data frame, so it cannot tell a tibble from any other list.
typeof(tony)
## [1] "list"
# class() is what actually verifies the kind of object: a tibble.
class(tony)
## [1] "tbl_df" "tbl" "data.frame"
# plot
# Popularity of "Annie" over time. `total` adds up the prop values of all
# rows sharing a year (one row per sex), and is attached to every row.
# `.by` does the grouping for this one mutate() only, so `annie` is not left
# as a grouped tibble.
annie <- babynames %>%
  filter(name == "Annie") %>%
  mutate(total = sum(prop), .by = c(year, name))
# Plot one value per year (the per-sex rows repeat the same total) as a line
# graph, with a smoothed trend on top.
ggplot(data = distinct(annie, year, total), aes(x = year, y = total)) +
  geom_line() +
  geom_smooth(method = "auto", se = TRUE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Flag rows where the name was given to MORE THAN 1% of babies that year
# (prop is a proportion, so 1% is 0.01; strictly greater, as the task asks).
# Rows with a missing prop are flagged FALSE. is.null() on a column is always
# FALSE, so that check was dropped.
baby_addpopular <- babynames %>%
  mutate(popular = !is.na(prop) & prop > 0.01)
#baby_addpopular %>% group_by(year, popular) %>% summarise(count = n())
# Rename the flag column from `popular` to `isPopular`
baby_rename <- rename(baby_addpopular, isPopular = popular)
# Keep only the popular rows
popular <- baby_rename %>% filter(isPopular)
# Most popular first: sort by descending share of births
popular %>% arrange(desc(prop))
## # A tibble: 3,878 × 6
## year sex name n prop isPopular
## <dbl> <chr> <chr> <int> <dbl> <lgl>
## 1 1880 M John 9655 0.0815 TRUE
## 2 1881 M John 8769 0.0810 TRUE
## 3 1880 M William 9532 0.0805 TRUE
## 4 1883 M John 8894 0.0791 TRUE
## 5 1881 M William 8524 0.0787 TRUE
## 6 1882 M John 9557 0.0783 TRUE
## 7 1884 M John 9388 0.0765 TRUE
## 8 1882 M William 9298 0.0762 TRUE
## 9 1886 M John 9026 0.0758 TRUE
## 10 1885 M John 8756 0.0755 TRUE
## # ℹ 3,868 more rows
#' Find the year in which a name reached its highest `prop`.
#'
#' Looks the name up in `babynames` and keeps the row(s) whose `prop` equals
#' the maximum `prop` among that name's rows.
#'
#' @param input_name Name to look up; compared with `==`, so the match is
#'   exact and case-sensitive.
#' @return A one-column tibble (`year`) holding the distinct peak year(s), or
#'   `NULL` (after emitting a message) when the name has no rows.
find_most_frequent_year <- function(input_name) {
  rows_for_name <- babynames %>% filter(name == input_name)
  if (nrow(rows_for_name) == 0) {
    # message() concatenates its arguments itself; the stray paste() call and
    # `sep` argument only appended a trailing space to the message.
    message("The name ", input_name, " does not exist in dataset")
    return(NULL)
  }
  rows_for_name %>%
    filter(prop == max(prop)) %>%
    distinct(year)
}
find_most_frequent_year("aa")
## The name aa does not exist in dataset
## NULL
find_most_frequent_year("John")
## # A tibble: 1 × 1
## year
## <dbl>
## 1 1880
#' Find the name with the highest `prop` in a given year.
#'
#' Looks the year up in `babynames` and keeps the row(s) whose `prop` equals
#' the maximum `prop` among that year's rows.
#'
#' @param input_year Year to look up; compared with `==` against the `year`
#'   column.
#' @return A one-column tibble (`name`) holding the distinct top name(s), or
#'   `NULL` (after emitting a message) when the year has no rows.
find_most_common_name <- function(input_year) {
  rows_for_year <- babynames %>% filter(year == input_year)
  if (nrow(rows_for_year) == 0) {
    # message() concatenates its arguments itself; the stray paste() call and
    # `sep` argument only appended a trailing space to the message.
    message("The year ", input_year, " does not exist in dataset")
    return(NULL)
  }
  rows_for_year %>%
    filter(prop == max(prop)) %>%
    distinct(name)
}
find_most_common_name(1980)
## # A tibble: 1 × 1
## name
## <chr>
## 1 Michael
find_most_common_name(19801)
## The year 19801 does not exist in dataset
## NULL
# Largest single-row count (n) recorded for the name "Michael"
most_time_given <- babynames %>%
  filter(name == "Michael") %>%
  summarise(most_time_given = max(n))
most_time_given
## # A tibble: 1 × 1
## most_time_given
## <int>
## 1 92695
To understand the difference between the variable n
and the function n() (that only works inside the summarize
function), consider the summarize below, adding in your
name object. What does the value of num_rows represent (in
real-world terms) in the previous result? Explain it to your future
manager.
babynames %>%
summarize(num_rows = n())
## # A tibble: 1 × 1
## num_rows
## <int>
## 1 1924665
Load the data in using the following command:
library(dplyr)
# Download the housing data set from GitHub into a data frame
original_data <- read.csv("https://raw.githubusercontent.com/kshoemaker/Class_test/master/housedata.csv")
# Rename GrLivArea -> TotalSqFeet and ScreenPorch -> PorchSqFeet for clarity.
# Because this is done with select(), the two renamed columns are also moved
# to the front; everything() keeps all remaining columns after them.
original_data <- original_data %>% select(TotalSqFeet = GrLivArea, PorchSqFeet = ScreenPorch, everything())
# Age of the oldest house: the current calendar year (taken from Sys.Date())
# minus the earliest YearBuilt. The result depends on the year the code is run.
oldest_house_age <- original_data %>%
summarise(oldest_house_age = as.numeric(format(Sys.Date(),"%Y")) - min(YearBuilt))
oldest_house_age
## oldest_house_age
## 1 152
How old is the oldest house?
How big is the biggest lot?
How much did the oldest house sell for?
library(dplyr)
library(ggplot2)
# How old is the oldest house? Current calendar year minus the earliest
# YearBuilt, so the answer depends on the year the code is run.
oldest_house_age <- summarise(
  original_data,
  oldest_house_age = as.numeric(format(Sys.Date(), "%Y")) - min(YearBuilt)
)
oldest_house_age
## oldest_house_age
## 1 152
# How big is the biggest lot? Largest LotArea value, returned as a double.
biggest_lot <- summarise(
  original_data,
  biggest_lot = as.numeric(max(LotArea))
)
biggest_lot
## biggest_lot
## 1 215245
# How much did the oldest house sell for? Sort ascending by build year, keep
# the first row, and extract its sale price as a plain vector.
oldest_house_price <- original_data %>%
  arrange(YearBuilt) %>%
  slice_head(n = 1) %>%
  pull(SalePrice)
oldest_house_price
## [1] 122000
What is the average total square footage and minimum lot area for each building type?
How many of the houses have central air?
Make a scatterplot with TotalSqFeet on the x-axis
and SalePrice on the y-axis, color the points by
BldgType.
Make a ggplot histogram of the porch square footage of all the houses.
Create a new variable has_porch in your dataframe
that indicates whether or not the house has a screened porch, something
like T/F or “yes”/“no”. How many houses have screened porches? How many
don’t?
Redo your histogram of the porch square footage, but now filter the data to only houses with screened porches.
# the average total square footage and minimum lot area for each building type
library(ggplot2)
# One row per building type: mean living area and smallest lot area
avg_sqf <- original_data %>%
  group_by(BldgType) %>%
  summarise(
    avg_sqf = mean(TotalSqFeet),
    minimum_lot = min(LotArea)
  )
avg_sqf
## # A tibble: 5 × 3
## BldgType avg_sqf minimum_lot
## <chr> <dbl> <int>
## 1 1Fam 1539. 2500
## 2 2fmCon 1561. 4456
## 3 Duplex 1567. 6040
## 4 Twnhs 1276. 1526
## 5 TwnhsE 1312. 1300
# Number of houses whose CentralAir value is "Y"
num_central_air <- nrow(filter(original_data, CentralAir == "Y"))
num_central_air
## [1] 1365
# Sale price against living area, points coloured by building type;
# the y axis is labelled with comma-separated numbers.
ggplot(original_data, aes(x = TotalSqFeet, y = SalePrice, color = BldgType)) +
  geom_point(size = 1) +
  scale_y_continuous(labels = scales::comma)
# bonus: the same scatterplot split into one panel per building type,
# each panel with its own axis ranges
ggplot(original_data, aes(x = TotalSqFeet, y = SalePrice, color = BldgType)) +
  geom_point(size = 1) +
  scale_y_continuous(labels = scales::comma) +
  facet_wrap(~ BldgType, scales = "free")
# histogram of porch square footage for all houses, in bins 10 wide
ggplot(original_data, aes(x = PorchSqFeet)) +
  geom_histogram(binwidth = 10, fill = "skyblue", color = "black") +
  labs(x = "Porch SQF", y = "Frequency")
# Flag houses that have a screened porch. PorchSqFeet is the ScreenPorch
# column (renamed when the data was loaded), so it -- not OpenPorchSF, which
# is a different column -- is the one to test. The flag is kept as the
# strings "T"/"F" so existing comparisons against "T" and "F" keep working.
data1 <- original_data %>%
  mutate(has_porch = ifelse(PorchSqFeet > 0, "T", "F"))
# How many houses have a screened porch?
num_hasScreenPorch <- data1 %>% filter(has_porch == "T") %>% nrow()
num_hasScreenPorch
## (re-run to refresh: the previously recorded value, 804, counted houses with OpenPorchSF > 0)
# How many houses do not?
num_noScreenPorch <- data1 %>% filter(has_porch == "F") %>% nrow()
num_noScreenPorch
## (re-run to refresh: the previously recorded value, 656, counted houses with OpenPorchSF == 0)
# Histogram of screened-porch square footage, restricted to houses that
# actually have a screened porch
data1 %>%
  filter(has_porch == "T") %>%
  ggplot(aes(x = PorchSqFeet)) +
  geom_histogram(binwidth = 10, fill = "skyblue", color = "red") +
  labs(x = "Porch SQF--", y = "Frequency")