library(tidyverse)
## -- Attaching packages ----------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts -------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(skimr)
##
## Attaching package: 'skimr'
## The following object is masked from 'package:stats':
##
## filter
library(knitr)
##
## Attaching package: 'knitr'
## The following object is masked from 'package:skimr':
##
## kable
library(ggfortify)
library(visdat)
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(broom)
library(corrplot)
## corrplot 0.84 loaded
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(esquisse)
USArrests <- read_csv("USArrests.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_character(),
## Murder = col_double(),
## Assault = col_double(),
## UrbanPop = col_double(),
## Rape = col_double()
## )
df <- USArrests %>%
rename(state = X1) %>%
rename (other = Rape) %>%
clean_names()
Let’s look at crime rate by state
df %>%
select(-urban_pop) %>%
gather(crime, rate, -state) %>%
ggplot() +
geom_col(aes(state, rate, fill = crime)) +
facet_wrap( ~ crime) +
coord_flip() +
theme_light() +
theme(legend.position = "none")
Just murder
example <- df %>%
mutate(state = fct_reorder(state, murder) %>%
fct_rev()) %>%
ggplot() +
geom_col(aes(state, murder)) +
theme(axis.text.x = element_text(angle = 90)) +
labs(x= "State", y = "Rate")
ggplotly(example)
Linear plot. Does urban population impact crime rates. Faceted by crime
df %>%
gather(crime, rate, -state, -urban_pop) %>%
ggplot(aes(urban_pop, rate, colour = crime)) +
geom_point() +
geom_smooth(aes(urban_pop, rate), method = "lm", se = F, colour = "black") +
facet_wrap( ~ crime, ncol = 1, scales = "free_y") +
theme_light()
How about using plotly?
example2 <- df %>%
gather(crime, rate, -state, -urban_pop) %>%
ggplot() +
###note the use of tooltip addition with text =
geom_point(aes(urban_pop, rate, colour = crime, text = paste("State:", state))) +
facet_wrap( ~ crime, ncol = 1, scales = "free_y") +
theme_light()
## Warning: Ignoring unknown aesthetics: text
###plot and only display the state (our "text" as defined above)
ggplotly(example2, tooltip = "text")
To add another geom, eg a trendline I need to add it serperately like so:
ggplotly(example2 + geom_smooth(aes(urban_pop, rate), method = "lm", se = F, colour = "black"))
geom_smooth(aes(urban_pop, rate), method = “lm”, se = F, colour = “black”) +
Does urban population assault rates? For assaults the relationship is not significant
test <- df %>%
### create long form data by stacking all crimes in one column
gather(crime, rate, -state, -urban_pop) %>%
### look at assaults only
filter(crime == "assault")
###simple linear model. Using "test" dataset, is the crime rate impacted by the urban population
fit <- lm(rate ~ urban_pop, test)
### use broom to give a tidy output
tidy(fit)
## # A tibble: 2 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 73.1 53.9 1.36 0.181
## 2 urban_pop 1.49 0.803 1.86 0.0695
How would we test all crimes in one model? By using “nest”
blah <- df %>%
###long form as before
gather(crime, rate, -state, -urban_pop) %>%
## group by the crime
group_by(crime) %>%
### nest the rest of the data by the groups
nest() %>%
### make new column called fit. use the nested data (default of new nested column is "data", is the crime rate impacted by the urban population?, data = . - need to understand why)
mutate(fit = map(data, ~lm(rate ~ urban_pop, data = .))) %>%
### make a new column called results, take the list from the "fit" column and convert to nested cell. The list/new cell contains "glance"d summary results
mutate(results = map(fit,glance)) %>%
### convert nested cells to new columns
unnest(results)
Using broom
blah2 <- df %>%
gather(crime, rate, -state, -urban_pop) %>%
group_by(crime) %>%
nest() %>%
mutate(fit = map(data, ~lm(rate ~ urban_pop, data = .))) %>%
mutate(results = map(fit,tidy)) %>%
unnest(results)
or this way
blah3 <- df %>%
gather(crime, rate, -state, -urban_pop) %>%
group_by(crime) %>%
nest() %>%
mutate(fit = map(data, ~lm(rate ~ urban_pop, data = .) %>% tidy)) %>%
unnest(fit)
Example of corrplot
###remove character column
data <- df %>%
select(-state)
corrplot(cor(data), order = "hclust")
Using plotly directly, do a scatter plot and add regression line
####make new df
df2 <-df %>%
####calculate the lm of murder based on assaults and extract the fitted values to a new column called slope
mutate(slope = lm(murder ~assault) %>% fitted.values())
###plot df2
plot_ly(df2, x = ~assault, y = ~murder) %>%
###as a scatter plot
add_markers() %>%
###and add a regression line using assault and the calculated slope
add_lines(x = ~assault, y = ~slope) %>%
layout(showlegend = FALSE)