I will mainly be using tidyr and dplyr here to do the analysis.
Thanks to Vyanna Hill for the analysis prompt:
Find out whether there is a relationship between the type of facility and the risk level assigned by the inspector.
To answer the question above, I will rely heavily on sorting, grouping (`group_by`), and filtering.
# Fetch the food-inspection CSV hosted on raw.githubusercontent.com.
# NOTE(review): despite its name, my_git_url is passed to read.csv(text = ...),
# so getURL() presumably returns the file contents as text -- confirm.
my_git_url <- getURL(
  "https://raw.githubusercontent.com/aelsaeyed/Data607/main/Project2/Food/food-inspections.csv"
)

# Parse the downloaded text into a data frame.
insp_raw <- read.csv(text = my_git_url)

# Preview the first ten rows of the raw data.
head(insp_raw, n = 10)
# Keep only columns 2, 5, 6, 7 and 10 of the raw data (selected by position).
insp_cleaned <- select(insp_raw, 2, 5, 6, 7, 10)

# Preview the trimmed data frame.
head(insp_cleaned)
# Per-establishment risk summary.
#
# For every DBA.Name, count how many inspections fall into each numeric risk
# level (1 = high, 2 = medium, 3 = low, following the count_* column names)
# and compute the share of rated inspections that were high risk. Only
# establishments with at least one high-risk inspection are kept, ordered by
# ascending high-risk rate.
res_grp <- insp_cleaned %>%
  # Split Risk on non-alphanumeric characters into three pieces; the second
  # piece is the numeric level. separate() warns when a row has more or fewer
  # than three pieces; rows with no second piece get risknum = NA.
  separate(Risk, c("Risk", "risknum", "risk_lvl")) %>%
  mutate(risknum = as.integer(risknum)) %>%
  # Positional selection drops the leftover text pieces of Risk.
  # NOTE(review): assumes risknum is the 4th column after separate() -- confirm.
  select(1, 2, 4, 6, 7) %>%
  group_by(DBA.Name) %>%
  summarize(
    # Count rows per level. Summing the logical mask counts matches, whereas
    # summing risknum itself would weight medium by 2 and low by 3.
    # na.rm = TRUE keeps one unparsed row from turning a whole group into NA.
    count_highrisk = sum(risknum == 1L, na.rm = TRUE),
    count_medrisk = sum(risknum == 2L, na.rm = TRUE),
    count_lowrisk = sum(risknum == 3L, na.rm = TRUE),
    # Denominator: inspections with a parsed risk level.
    count_total = sum(!is.na(risknum))
  ) %>%
  mutate(rate_high_risk = count_highrisk / count_total) %>%
  # count_highrisk > 0 also guarantees count_total > 0, so the rate is finite.
  filter(count_highrisk > 0) %>%
  arrange(rate_high_risk) %>%
  select(DBA.Name, count_highrisk, count_total, rate_high_risk)
## Warning: Expected 3 pieces. Additional pieces discarded in 196725 rows [1, 2, 3,
## 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 100 rows [21,
## 81, 136, 1562, 1961, 2178, 2439, 2751, 3000, 3485, 5459, 5828, 5881, 6728, 6877,
## 8041, 9856, 10119, 13610, 14581, ...].
# Preview the first six rows of the per-establishment summary.
head(res_grp, n = 6L)