Load the tidyverse libraries
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0 ✓ purrr 0.3.4
## ✓ tibble 3.0.1 ✓ dplyr 0.8.5
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
Load the plotting helper packages (ggthemes, ggrepel) and the murders data set from dslabs
library(ggthemes)
library(ggrepel)
library(dslabs)
data(murders)
Inspect the structure and dimensions of the data
# Show the compact structure of the data frame: each column's name, type,
# and first few values.
str(murders)
## 'data.frame': 51 obs. of 5 variables:
## $ state : chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ abb : chr "AL" "AK" "AZ" "AR" ...
## $ region : Factor w/ 4 levels "Northeast","South",..: 2 4 4 2 4 4 1 2 2 2 ...
## $ population: num 4779736 710231 6392017 2915918 37253956 ...
## $ total : num 135 19 232 93 1257 ...
# Number of rows followed by number of columns.
c(nrow(murders), ncol(murders))
## [1] 51 5
The data has 51 rows and 5 columns. Plot total murders against population (in millions), labelling each point with the state abbreviation
# Scatter plot of total murders against population in millions, with state
# abbreviations as labels. The aesthetics are spelled out in each layer;
# nudge_x shifts the labels to the right of the points.
ggplot(murders) +
  geom_point(aes(x = population / 10^6, y = total), size = 3) +
  geom_text(aes(x = population / 10^6, y = total, label = abb), nudge_x = 1)
# Same scatter plot, but the aesthetics are declared once in ggplot() and
# shared by both layers.
murders %>%
  ggplot(aes(population / 10^6, total, label = abb)) +
  geom_point(size = 3) +
  geom_text(nudge_x = 1.5)
Apply a log base 10 scale to the x-axis and the y-axis
# Log-log version of the scatter plot, using the generic continuous scales
# with a log10 transformation.
murders %>%
  ggplot(aes(population / 10^6, total, label = abb)) +
  geom_point(size = 3) +
  geom_text(nudge_x = 0.075) +
  scale_x_continuous(trans = "log10") +
  scale_y_continuous(trans = "log10")
Restructuring the code to use the dedicated log-scale functions
# Log-log scatter plot again, this time with the scale_x_log10() and
# scale_y_log10() shorthands.
murders %>%
  ggplot(aes(population / 10^6, total, label = abb)) +
  geom_point(size = 3) +
  geom_text(nudge_x = 0.075) +
  scale_x_log10() +
  scale_y_log10()
Store the plot in a variable and build it up layer by layer, then compute the national murder rate
# Assemble the labelled log-log scatter plot in one chain and keep it in `p`
# so later steps can add to it.
p <- murders %>%
  ggplot(aes(population / 10^6, total, label = abb)) +
  geom_point(size = 3, color = "blue") +
  geom_text(nudge_x = 0.075) +
  scale_x_log10() +
  scale_y_log10() +
  xlab("Population in millions (log scale)") +
  ylab("Total number of murders (log scale)") +
  ggtitle("US Gun Murders in 2010")
print(p)
# National murder rate per million people: all murders divided by the total
# population, scaled by 10^6.
r <- with(murders, sum(total) / sum(population) * 10^6)
print(r)
## [1] 30.34555
Add a line representing the average murder rate for the whole country, and colour the points by region
# First attempt: colour the points by region and draw the national-average
# line with intercept log10(r) (geom_abline() uses slope 1 by default). The
# line is added last, so it is drawn over the points.
# This version is only printed, not stored in `p`. Storing it would leave the
# solid black line and an extra point layer underneath the refined version
# below.
print(
  p +
    geom_point(aes(col = region), size = 3) +
    geom_abline(intercept = log10(r))
)

# Refined version, kept in `p`: a dashed, dark grey line added before the
# region-coloured points so the points are drawn on top of it.
p <- p +
  geom_abline(intercept = log10(r), lty = 2, color = "darkgrey") +
  geom_point(aes(col = region), size = 3)
print(p)
# Set the dslabs theme as the session default.
ds_theme_set()

# Preview the plot with the Economist theme from ggthemes.
p <- p + theme_economist()
print(p)

# Preview it again with the FiveThirtyEight theme added on top.
p <- p + theme_fivethirtyeight()
print(p)
Putting it all together: rebuild the complete plot from the beginning
# National murder rate per million people, recomputed for the final plot.
r <- murders %>%
  summarise(rate = sum(total) / sum(population) * 10^6) %>%
  pull(rate)
print(r)
## [1] 30.34555
# Final plot built in one chain. The dashed average-rate line goes in first so
# the region-coloured points and the repelled state labels are drawn over it.
p <- murders %>%
  ggplot(aes(population / 10^6, total, label = abb)) +
  geom_abline(intercept = log10(r), lty = 2, color = "darkgrey") +
  geom_point(aes(col = region), size = 3) +
  geom_text_repel() +
  scale_x_log10() +
  scale_y_log10() +
  labs(
    x = "Population in millions (log scale)",
    y = "Total number of murders (log scale)",
    title = "US Gun Murders in 2010"
  ) +
  scale_color_discrete(name = "Region") +
  theme_economist()
print(p)