Load the tidyverse libraries

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0     ✓ purrr   0.3.4
## ✓ tibble  3.0.1     ✓ dplyr   0.8.5
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Load the plotting helper packages and the murders data set from dslabs

# Extra ggplot2 themes (theme_economist(), theme_fivethirtyeight() are used below).
library(ggthemes)
# geom_text_repel() for non-overlapping point labels (used in the final plot).
library(ggrepel)
# Provides the murders data set and ds_theme_set().
library(dslabs)
# Load the murders data frame into the workspace.
data(murders)

Study the features and dimensions of the data

# Compact overview of the data frame: each column's name, type and first values.
str(murders)
## 'data.frame':    51 obs. of  5 variables:
##  $ state     : chr  "Alabama" "Alaska" "Arizona" "Arkansas" ...
##  $ abb       : chr  "AL" "AK" "AZ" "AR" ...
##  $ region    : Factor w/ 4 levels "Northeast","South",..: 2 4 4 2 4 4 1 2 2 2 ...
##  $ population: num  4779736 710231 6392017 2915918 37253956 ...
##  $ total     : num  135 19 232 93 1257 ...
# Number of rows and columns.
dim(murders)
## [1] 51  5

It has 51 rows and 5 columns. Plot the data

# Scatter plot of total murders against population in millions, with each
# point labelled by its state abbreviation. The aesthetics are declared
# separately inside every layer.
murders %>%
  ggplot() +
  geom_point(aes(x = population / 10^6, y = total), size = 3) +
  geom_text(
    aes(x = population / 10^6, y = total, label = abb),
    nudge_x = 1
  )

# Same scatter plot, but the aesthetics are declared once in ggplot() and
# inherited by both the point and the label layers.
murders %>%
  ggplot(aes(x = population / 10^6, y = total, label = abb)) +
  geom_point(size = 3) +
  geom_text(nudge_x = 1.5)

Apply a log base 10 scale to the x-axis and y-axis

# Put both axes on a log10 scale through the `trans` argument of the
# continuous scales. The label nudge is smaller because it is now applied on
# the log scale.
murders %>%
  ggplot(aes(x = population / 10^6, y = total, label = abb)) +
  geom_point(size = 3) +
  geom_text(nudge_x = 0.075) +
  scale_x_continuous(trans = "log10") +
  scale_y_continuous(trans = "log10")

Restructuring the code with the dedicated log-scale functions

# Same log-log plot, written with the scale_x_log10() / scale_y_log10()
# shortcuts instead of scale_*_continuous(trans = "log10").
murders %>%
  ggplot(aes(x = population / 10^6, y = total, label = abb)) +
  geom_point(size = 3) +
  geom_text(nudge_x = 0.075) +
  scale_x_log10() +
  scale_y_log10()

Store the plot in a variable and add axis labels and a title

# Assemble the labelled log-log scatter plot in one chain and keep it in `p`
# so that later steps can add further layers to it.
p <- murders %>%
  ggplot(aes(population / 10^6, total, label = abb)) +
  geom_point(size = 3, color = "blue") +
  geom_text(nudge_x = 0.075) +
  scale_x_log10() +
  scale_y_log10() +
  xlab("Population in millions (log scale)") +
  ylab("Total number of murders (log scale)") +
  ggtitle("US Gun Murders in 2010")
print(p)

# National murder rate per million people: total murders over total
# population, scaled by 10^6. Evaluated directly on the columns of `murders`.
r <- with(murders, sum(total) / sum(population) * 10^6)
r
## [1] 30.34555

Basic line with average murder rate for the country

# Preview: colour the points by region and draw the national-average line.
# With only an intercept supplied, geom_abline() uses slope 1, which on these
# log-log axes corresponds to total = r * (population in millions).
#
# The preview is printed WITHOUT being stored back into `p`. Storing it would
# keep this default solid black line (and an extra point layer) in `p`, and
# any styled line added later at the same position would be drawn on top of
# the solid one, hiding its dashed/grey styling.
print(
  p +
    geom_point(aes(col = region), size = 3) +
    geom_abline(intercept = log10(r))
)

# Add the national-average line as a dashed dark-grey line first, then draw
# the region-coloured points after it so the points sit on top of the line.
p <- p +
  geom_abline(intercept = log10(r), lty = 2, color = "darkgrey") +
  geom_point(aes(col = region), size = 3)
print(p)

# Switch the default plot theme to the dslabs one.
ds_theme_set()

# Render the plot with The Economist theme from ggthemes.
p <- p + theme_economist()
print(p)

# Add the FiveThirtyEight theme on top of that and render again.
p <- p + theme_fivethirtyeight()
print(p)

Putting it all together from the beginning

  1. define intercept
# National murder rate per million people. Its log10 is used as the intercept
# of the reference line in the plot.
r <- murders %>%
  summarise(rate = sum(total) / sum(population) * 10^6) %>%
  pull(rate)
print(r)
## [1] 30.34555
  2. make the plot
# Final figure, built in one chain:
#   - dashed grey reference line at the national average rate, drawn first so
#     the points sit on top of it
#   - points coloured by region, with repelled state labels
#   - log10 axes, axis labels, title, legend title and The Economist theme
p <- murders %>%
  ggplot(aes(x = population / 10^6, y = total, label = abb)) +
  geom_abline(intercept = log10(r), lty = 2, color = "darkgrey") +
  geom_point(aes(col = region), size = 3) +
  geom_text_repel() +
  scale_x_log10() +
  scale_y_log10() +
  labs(
    x = "Population in millions (log scale)",
    y = "Total number of murders (log scale)",
    title = "US Gun Murders in 2010"
  ) +
  scale_color_discrete(name = "Region") +
  theme_economist()
print(p)