options(scipen = 999)
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
-- Attaching packages --------------------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
v ggplot2 3.3.3 v purrr 0.3.4
v tibble 3.0.4 v dplyr 1.0.2
v tidyr 1.1.2 v stringr 1.4.0
v readr 1.4.0 v forcats 0.5.0
-- Conflicts ------------------------------------------------------------------------------------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
boston <- read_csv("boston_2020.csv")
-- Column specification --------------------------------------------------------------------------------------------------------------------------------------
cols(
.default = col_character(),
PID = col_double(),
AV_TOTAL = col_double(),
LAND_SF = col_double(),
YR_BUILT = col_double(),
YR_REMOD = col_double(),
LIVING_AREA = col_double(),
NUM_FLOORS = col_double(),
STRUCTURE_CLASS = col_logical(),
R_TOTAL_RMS = col_double(),
R_BDRMS = col_double(),
R_FULL_BTH = col_double(),
R_HALF_BTH = col_double(),
R_FPLACE = col_double()
)
i Use `spec()` for the full column specifications.
zips <- read_csv("boston_zips.csv")
-- Column specification --------------------------------------------------------------------------------------------------------------------------------------
cols(
ZIP = col_character(),
Population = col_double(),
Pop_Density = col_double(),
Median_Income = col_double(),
City_State = col_character()
)
truecar <- read_csv("true_car_prices_50k.csv")
-- Column specification --------------------------------------------------------------------------------------------------------------------------------------
cols(
vin = col_character(),
make = col_character(),
model = col_character(),
year = col_double(),
price = col_double(),
mileage = col_double(),
city = col_character(),
state = col_character(),
region = col_character(),
population = col_double(),
lat = col_double(),
lng = col_double()
)
Framework for our function: first, write working code that does exactly what we want the function to do.
# Count rows per make, attach each make's share of all rows, sort by count
# (largest first), and keep the top 20 makes by count.
freq <- truecar %>%
  group_by(make) %>%
  summarise(n = n()) %>%
  mutate(pct = n / sum(n)) %>%
  arrange(desc(n)) %>%
  top_n(n = 20, wt = n)
`summarise()` ungrouping output (override with `.groups` argument)
# Show the frequency table, then draw the counts as horizontal bars
# with makes ordered by their count.
print(freq)
ggplot(freq, aes(x = reorder(make, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Frequency Analysis",
    x = "category",
    y = "count"
  )
Next, wrap that working code in a function, like this.
# Frequency analysis of the global `truecar` data by its `make` column.
#
# Takes no arguments. Prints the table of the top 20 makes by count
# (count `n` and share `pct`), then returns a horizontal bar chart of
# those counts.
freq_function <- function() {
  by_make <- group_by(truecar, make)
  make_counts <- summarise(by_make, n = n())
  make_counts <- mutate(make_counts, pct = n / sum(n))
  top_makes <- top_n(arrange(make_counts, desc(n)), 20, n)

  print(top_makes)

  ggplot(top_makes, aes(x = reorder(make, n), y = n)) +
    geom_col() +
    coord_flip() +
    labs(
      title = "Frequency Analysis",
      x = "category",
      y = "count"
    )
}
freq_function()
Add a `data` argument to the function and replace `truecar` in the body with that argument.
# Frequency analysis of the `make` column of `data`.
#
# Arguments:
#   data - a data frame containing a `make` column.
#
# Prints the table of the top 20 makes by count (count `n` and share `pct`),
# then returns a horizontal bar chart of those counts.
freq_function <- function(data) {
  by_make <- group_by(data, make)
  make_counts <- summarise(by_make, n = n())
  make_counts <- mutate(make_counts, pct = n / sum(n))
  top_makes <- top_n(arrange(make_counts, desc(n)), 20, n)

  print(top_makes)

  ggplot(top_makes, aes(x = reorder(make, n), y = n)) +
    geom_col() +
    coord_flip() +
    labs(
      title = "Frequency Analysis",
      x = "category",
      y = "count"
    )
}
freq_function(truecar)
`summarise()` ungrouping output (override with `.groups` argument)
Next, you want to pass the function a column to run the frequency analysis on. To do this, you need to tell R that the string you pass is in fact a column name: wrapping it as !!as.name("string") tells R to treat the string as a column.
# Frequency analysis of one column of a data frame.
#
# Arguments:
#   data   - a data frame (or tibble).
#   column - name of the column to tabulate, given as a single string.
#
# Prints the top 20 values of `column` by count, with the count `n` and the
# share `pct` of all rows, then returns a horizontal bar chart of those counts.
# Stops with an error if `column` is not a single name present in `data`.
freq_function <- function(data, column) {
  # Fail early with a readable message instead of "object '...' not found".
  stopifnot(
    is.data.frame(data),
    is.character(column),
    length(column) == 1,
    column %in% names(data)
  )

  # `.data[[column]]` looks the string up as a column of the current data.
  freq <- data %>%
    group_by(.data[[column]]) %>%
    summarise(n = n(), .groups = "drop") %>%
    mutate(pct = n / sum(n)) %>%
    arrange(desc(n)) %>%
    top_n(20, n)

  print(freq)

  freq %>%
    ggplot(aes(reorder(.data[[column]], n), n)) +
    geom_col() +
    coord_flip() +
    labs(
      title = "Frequency Analysis",
      subtitle = column,
      y = "count",
      x = "category"
    )
}
freq_function(truecar, "make")
`summarise()` ungrouping output (override with `.groups` argument)
freq_function(truecar, "model")
`summarise()` ungrouping output (override with `.groups` argument)
freq_function(truecar, "year")
`summarise()` ungrouping output (override with `.groups` argument)
freq_function(truecar, "city")
`summarise()` ungrouping output (override with `.groups` argument)
freq_function(truecar, "state")
`summarise()` ungrouping output (override with `.groups` argument)
Your challenge is to add the mean, min, max, and the column name to the table returned by the function call.
# Summary statistics for one column of a data frame.
#
# Arguments:
#   dataframe - a data frame (or tibble).
#   column    - name of the column to summarise, given as a single string.
#   na.rm     - drop missing values before computing mean/min/max?
#               Defaults to TRUE so a column with missing values still
#               yields usable statistics; the number of missing values is
#               reported separately in `n_miss`.
#
# Returns a one-row table with: n (row count), n_distinct, n_miss,
# mean, min, max, and `column` (the name of the summarised column).
get_stats <- function(dataframe, column, na.rm = TRUE) {
  # Fail early with a readable message instead of "object '...' not found".
  stopifnot(
    is.data.frame(dataframe),
    is.character(column),
    length(column) == 1,
    column %in% names(dataframe)
  )

  dataframe %>%
    summarise(
      n = n(),
      n_distinct = n_distinct(.data[[column]]),
      n_miss = sum(is.na(.data[[column]])),
      mean = mean(.data[[column]], na.rm = na.rm),
      min = min(.data[[column]], na.rm = na.rm),
      max = max(.data[[column]], na.rm = na.rm)
    ) %>%
    # Record which column these statistics describe.
    mutate(column = column)
}
# Example calls: summary statistics for numeric columns of the Boston
# property data and the TrueCar listings.
get_stats(boston,"AV_TOTAL")
get_stats(boston, "LIVING_AREA")
get_stats(truecar, "price")
get_stats(truecar, "mileage")
# Example inputs for get_chart(). In the visible code the function is called
# with explicit arguments, and its parameters of the same names shadow these
# globals, so nothing visible here reads them.
group_by_column <- "ZIPCODE"
mean_column <- "AV_TOTAL"
dataset <- boston
# Bar chart of the mean of one column within each level of another column.
#
# Arguments:
#   dataframe       - a data frame (or tibble).
#   group_by_column - name of the grouping column, as a single string.
#   mean_column     - name of the column to average, as a single string.
#   caption         - plot caption; defaults to "Dataset: Boston".
#
# Returns a ggplot object with one bar per group, ordered by the group mean.
# The title and axis labels are built from the column names passed in.
# Missing values in `mean_column` are ignored when averaging.
get_chart <- function(dataframe, group_by_column, mean_column,
                      caption = "Dataset: Boston") {
  # Fail early with a readable message instead of "object '...' not found".
  stopifnot(
    is.data.frame(dataframe),
    is.character(group_by_column), length(group_by_column) == 1,
    is.character(mean_column), length(mean_column) == 1,
    all(c(group_by_column, mean_column) %in% names(dataframe))
  )

  res <- dataframe %>%
    group_by(.data[[group_by_column]]) %>%
    summarise(
      mean = mean(.data[[mean_column]], na.rm = TRUE),
      .groups = "drop"
    )

  res %>%
    ggplot(aes(reorder(.data[[group_by_column]], mean), mean)) +
    geom_col() +
    labs(
      # paste() already separates its arguments with a single space.
      title = paste("mean", mean_column, "by", group_by_column),
      x = group_by_column,
      y = paste("Mean", mean_column),
      caption = caption
    )
}
# Example inputs for get_chart2(). In the visible code the function is called
# with explicit arguments, and its parameters of the same names shadow these
# globals, so nothing visible here reads them.
group_by_column2 <- "R_BLDG_STYL"
mean_column2 <- "LIVING_AREA"
dataset <- boston
# Bar chart of the mean of one column within each level of another column,
# with x-axis labels rotated 45 degrees.
#
# Arguments:
#   dataframe        - a data frame (or tibble).
#   group_by_column2 - name of the grouping column, as a single string.
#   mean_column2     - name of the column to average, as a single string.
#   caption          - plot caption; defaults to "Dataset: Boston".
#
# Returns a ggplot object with one bar per group, ordered by the group mean.
# The title and axis labels are built from the column names passed in.
# Missing values in `mean_column2` are ignored when averaging.
get_chart2 <- function(dataframe, group_by_column2, mean_column2,
                       caption = "Dataset: Boston") {
  # Fail early with a readable message instead of "object '...' not found".
  stopifnot(
    is.data.frame(dataframe),
    is.character(group_by_column2), length(group_by_column2) == 1,
    is.character(mean_column2), length(mean_column2) == 1,
    all(c(group_by_column2, mean_column2) %in% names(dataframe))
  )

  res <- dataframe %>%
    group_by(.data[[group_by_column2]]) %>%
    summarise(
      mean = mean(.data[[mean_column2]], na.rm = TRUE),
      .groups = "drop"
    )

  res %>%
    ggplot(aes(reorder(.data[[group_by_column2]], mean), mean)) +
    geom_col() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    labs(
      # paste() already separates its arguments with a single space.
      title = paste("mean", mean_column2, "by", group_by_column2),
      x = group_by_column2,
      y = paste("Mean", mean_column2),
      caption = caption
    )
}
# Example inputs for get_chart3(). In the visible code the function is called
# with explicit arguments, and its parameters of the same names shadow these
# globals, so nothing visible here reads them.
group_by_column3 <- "make"
mean_column3 <- "price"
dataset <- truecar
# Bar chart of the mean of one column within each level of another column,
# with x-axis labels rotated 45 degrees.
#
# Arguments:
#   dataframe        - a data frame (or tibble).
#   group_by_column3 - name of the grouping column, as a single string.
#   mean_column3     - name of the column to average, as a single string.
#   caption          - plot caption; defaults to "Dataset: truecar".
#
# Returns a ggplot object with one bar per group, ordered by the group mean.
# The title and axis labels are built from the column names passed in.
# Missing values in `mean_column3` are ignored when averaging.
get_chart3 <- function(dataframe, group_by_column3, mean_column3,
                       caption = "Dataset: truecar") {
  # Fail early with a readable message instead of "object '...' not found".
  stopifnot(
    is.data.frame(dataframe),
    is.character(group_by_column3), length(group_by_column3) == 1,
    is.character(mean_column3), length(mean_column3) == 1,
    all(c(group_by_column3, mean_column3) %in% names(dataframe))
  )

  res <- dataframe %>%
    group_by(.data[[group_by_column3]]) %>%
    summarise(
      mean = mean(.data[[mean_column3]], na.rm = TRUE),
      .groups = "drop"
    )

  res %>%
    ggplot(aes(reorder(.data[[group_by_column3]], mean), mean)) +
    geom_col() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    labs(
      # paste() already separates its arguments with a single space.
      title = paste("mean", mean_column3, "by", group_by_column3),
      x = group_by_column3,
      y = paste("Mean", mean_column3),
      caption = caption
    )
}
# Example inputs for get_chart4(). In the visible code the function is called
# with explicit arguments, and its parameters of the same names shadow these
# globals, so nothing visible here reads them.
group_by_column4 <- "make"
mean_column4 <- "mileage"
dataset <- truecar
# Bar chart of the mean of one column within each level of another column,
# with x-axis labels rotated 45 degrees.
#
# Arguments:
#   dataframe        - a data frame (or tibble).
#   group_by_column4 - name of the grouping column, as a single string.
#   mean_column4     - name of the column to average, as a single string.
#   caption          - plot caption; defaults to "Dataset: truecar".
#
# Returns a ggplot object with one bar per group, ordered by the group mean.
# The title and axis labels are built from the column names passed in.
# Missing values in `mean_column4` are ignored when averaging.
get_chart4 <- function(dataframe, group_by_column4, mean_column4,
                       caption = "Dataset: truecar") {
  # Fail early with a readable message instead of "object '...' not found".
  stopifnot(
    is.data.frame(dataframe),
    is.character(group_by_column4), length(group_by_column4) == 1,
    is.character(mean_column4), length(mean_column4) == 1,
    all(c(group_by_column4, mean_column4) %in% names(dataframe))
  )

  res <- dataframe %>%
    group_by(.data[[group_by_column4]]) %>%
    summarise(
      mean = mean(.data[[mean_column4]], na.rm = TRUE),
      .groups = "drop"
    )

  res %>%
    ggplot(aes(reorder(.data[[group_by_column4]], mean), mean)) +
    geom_col() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    labs(
      # paste() already separates its arguments with a single space.
      title = paste("mean", mean_column4, "by", group_by_column4),
      x = group_by_column4,
      y = paste("Mean", mean_column4),
      caption = caption
    )
}
get_chart(boston, "ZIPCODE", "AV_TOTAL")
`summarise()` ungrouping output (override with `.groups` argument)
get_chart2(boston, "R_BLDG_STYL", "LIVING_AREA")
`summarise()` ungrouping output (override with `.groups` argument)
get_chart3(truecar, "make", "price")
`summarise()` ungrouping output (override with `.groups` argument)
get_chart4(truecar, "make", "mileage")
`summarise()` ungrouping output (override with `.groups` argument)