# Set the knitr chunk default so each chunk's source code is echoed in the
# rendered document alongside its output.
knitr::opts_chunk$set(echo = TRUE)

Create Scatterplot

library(tidyverse)
## -- Attaching packages ----------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.1     v dplyr   1.0.0
## v tidyr   1.1.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts -------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(readr)
library(ggplot2)
library(dplyr)
# NOTE(review): hard-coded, machine-specific working directory. The relative
# paths used later in this script ("crime.csv", "crime2.csv",
# "food_stamps.csv") resolve against it, so the script only runs as-is on a
# machine that has this folder. Consider a project-relative path instead.
setwd( "C:/Users/Jerome/Documents/Data_Science_110/Datasets")
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Read Crimes Data Set

# Download the 2005 crime-rates-by-state table and parse it into a tibble;
# readr guesses the column types and reports them in the message that follows.
crime <- read_csv(
  file = "http://datasets.flowingdata.com/crimeRatesByState2005.csv"
)
## Parsed with column specification:
## cols(
##   state = col_character(),
##   murder = col_double(),
##   forcible_rape = col_double(),
##   robbery = col_double(),
##   aggravated_assault = col_double(),
##   burglary = col_double(),
##   larceny_theft = col_double(),
##   motor_vehicle_theft = col_double(),
##   population = col_double()
## )

To save the data frame as a CSV file, include the .csv extension in the quoted file name. write.csv() does not add the extension for you, so without it the file is still written as comma-separated text but has no .csv extension.

# Save the downloaded table as CSV. row.names = FALSE stops write.csv() from
# prepending an unnamed column of row numbers, which carries no information
# here and would come back as a spurious extra column when the file is re-read.
write.csv(crime, file = "crime.csv", row.names = FALSE)

Check out the first few lines

# Inspect the data frame itself. The earlier call, summary("crimes"),
# summarised a one-element character string rather than the `crime` data,
# which is why it reported only Length 1 / Class character.
head(crime)      # first few rows, as the heading asks
summary(crime)   # per-column summary statistics

Map the variables on the x and y axes

# Empty canvas: data and x/y mappings only, no geom layer yet.
ggplot(data = crime, mapping = aes(x = murder, y = burglary))

# Change the theme
# Same canvas, now with descriptive axis titles and the minimal theme.
ggplot(data = crime, mapping = aes(x = murder, y = burglary)) +
  labs(
    x = "Murder rates in each state per 100,000",
    y = "Burglary rates in each state per 100,000"
  ) +
  theme_minimal(base_size = 12)

## Show the data

# p1: labelled, themed base plot of burglary against murder for every row of
# `crime`; kept as an object so layers can be added to it.
p1 <- ggplot(data = crime, mapping = aes(x = murder, y = burglary)) +
  labs(
    x = "Murder rates in each state per 100,000",
    y = "Burglary rates in each state per 100,000"
  ) +
  theme_minimal(base_size = 12)

# Draw one point per row.
p1 + geom_point()

## Remove DC and USA

# Drop the "District of Columbia" and "United States" rows in a single pass.
# %in% never returns NA, so a row with a missing state name is kept unchanged
# instead of being turned into an all-NA row, which `!=` subsetting would do.
crime2 <- crime[!crime$state %in% c("District of Columbia", "United States"), ]

# p2: the same labelled, themed base plot as before, built on the filtered data.
p2 <- ggplot(crime2, aes(x = murder, y = burglary)) +
  xlab("Murder rates in each state per 100,000") +
  ylab("Burglary rates in each state per 100,000") +
  theme_minimal(base_size = 12)
p2 + geom_point()

## Adjust the axes

# Zoom with coord_cartesian() instead of xlim()/ylim(). Scale limits convert
# out-of-range observations to NA before any stat runs, which removed one row
# from the points and from every geom_smooth() fit below; the drawn regression
# line therefore did not match lm() fitted on all of crime2. coord_cartesian()
# only changes the visible window, so all rows take part in the fits.
p3 <- p2 + coord_cartesian(xlim = c(0, 10), ylim = c(0, 1200))
p3 + geom_point()

## Add confidence interval w/ “Smoother”

# With no method given, geom_smooth() picks a smoother itself (loess here, as
# the message below reports) and draws its confidence band.
p4 <- p3 + geom_point() + geom_smooth(color = "red")
p4
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

## Add Regression Line

# Straight-line least-squares fit of y on x, with the default confidence band.
p5 <- p3 + geom_point() + geom_smooth(method = "lm", formula = y ~ x)
p5

## Add a title, make the line dashed, and remove the confidence interval

# se = FALSE hides the confidence band; the line is drawn thin and dot-dashed.
p6 <- p3 +
  geom_point() +
  geom_smooth(
    method = "lm", formula = y ~ x,
    se = FALSE, linetype = "dotdash", size = 0.3
  ) +
  ggtitle("MURDERS VERSUS BURGLARIES IN THE U.S.")
p6

What is the linear equation of that linear regression model?

# Pearson correlation between the burglary and murder columns of crime2.
with(crime2, cor(burglary, murder))
## [1] 0.6231757
# Simple linear regression of burglary on murder; summary() prints the
# coefficient table (intercept and slope) along with the fit statistics.
fit1 <- lm(formula = burglary ~ murder, data = crime2)
summary(fit1)
## 
## Call:
## lm(formula = burglary ~ murder, data = crime2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -385.38 -132.23    2.97  138.78  386.32 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   398.26      59.28   6.718 1.99e-08 ***
## murder         62.17      11.26   5.521 1.34e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 186.5 on 48 degrees of freedom
## Multiple R-squared:  0.3883, Adjusted R-squared:  0.3756 
## F-statistic: 30.48 on 1 and 48 DF,  p-value: 1.342e-06

More variables, scatterplot matrix

##install.packages("GGally")
library(GGally)
## Warning: package 'GGally' was built under R version 4.0.2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Scatterplot matrix of columns 2 through 8 of crime2. Per the column
# specification printed when the file was read, these are the seven crime-rate
# columns (column 1 is state, column 9 is population).
ggpairs(data = crime2, columns = 2:8)

# Save the filtered table as CSV. row.names = FALSE stops write.csv() from
# prepending an unnamed column of row numbers, which would come back as a
# spurious extra column when the file is re-read.
write.csv(crime2, file = "crime2.csv", row.names = FALSE)

Now try to make a multiple regression model

# Multiple regression: murder on burglary, forcible rape, aggravated assault,
# larceny theft and motor vehicle theft, fitted on crime2.
fit2 <- lm(
  formula = murder ~ burglary + forcible_rape + aggravated_assault +
    larceny_theft + motor_vehicle_theft,
  data = crime2
)
summary(fit2)
## 
## Call:
## lm(formula = murder ~ burglary + forcible_rape + aggravated_assault + 
##     larceny_theft + motor_vehicle_theft, data = crime2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.0273 -0.8949 -0.1242  0.6644  3.3027 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          3.0252038  1.0642566   2.843 0.006761 ** 
## burglary             0.0050527  0.0014907   3.389 0.001488 ** 
## forcible_rape       -0.0461137  0.0216312  -2.132 0.038644 *  
## aggravated_assault   0.0094150  0.0022044   4.271 0.000102 ***
## larceny_theft       -0.0015024  0.0006003  -2.503 0.016104 *  
## motor_vehicle_theft  0.0020733  0.0012653   1.639 0.108440    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.522 on 44 degrees of freedom
## Multiple R-squared:  0.6287, Adjusted R-squared:  0.5866 
## F-statistic:  14.9 on 5 and 44 DF,  p-value: 1.49e-08
# Draw the default lm diagnostic plots for fit2.
plot(fit2)

Drop Auto Theft

# Same model as fit2 with the motor_vehicle_theft predictor removed.
fit3 <- lm(
  formula = murder ~ burglary + forcible_rape + aggravated_assault +
    larceny_theft,
  data = crime2
)
summary(fit3)
## 
## Call:
## lm(formula = murder ~ burglary + forcible_rape + aggravated_assault + 
##     larceny_theft, data = crime2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.1805 -1.0081 -0.3100  0.8769  3.4506 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         2.8415937  1.0779713   2.636 0.011468 *  
## burglary            0.0055794  0.0014826   3.763 0.000483 ***
## forcible_rape      -0.0471186  0.0220236  -2.139 0.037858 *  
## aggravated_assault  0.0096735  0.0022395   4.320 8.51e-05 ***
## larceny_theft      -0.0012679  0.0005938  -2.135 0.038211 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.55 on 45 degrees of freedom
## Multiple R-squared:  0.6061, Adjusted R-squared:  0.5711 
## F-statistic: 17.31 on 4 and 45 DF,  p-value: 1.153e-08
# Draw the default lm diagnostic plots for fit3.
plot(fit3)

## Predict Burglary

# Switch the response: regress burglary on murder, forcible rape, aggravated
# assault, larceny theft and motor vehicle theft, fitted on crime2.
fit4 <- lm(
  formula = burglary ~ murder + forcible_rape + aggravated_assault +
    larceny_theft + motor_vehicle_theft,
  data = crime2
)
summary(fit4)
## 
## Call:
## lm(formula = burglary ~ murder + forcible_rape + aggravated_assault + 
##     larceny_theft + motor_vehicle_theft, data = crime2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -271.30  -79.31   -7.40   94.15  387.49 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -138.27831  102.16469  -1.353  0.18281    
## murder                40.97562   12.08935   3.389  0.00149 ** 
## forcible_rape          2.19565    2.01915   1.087  0.28278    
## aggravated_assault     0.12375    0.23536   0.526  0.60168    
## larceny_theft          0.22136    0.04716   4.693 2.64e-05 ***
## motor_vehicle_theft    0.06020    0.11702   0.514  0.60955    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 137 on 44 degrees of freedom
## Multiple R-squared:  0.6975, Adjusted R-squared:  0.6631 
## F-statistic: 20.29 on 5 and 44 DF,  p-value: 1.907e-10
# Draw the default lm diagnostic plots for fit4.
plot(fit4)

Burglary based on murder + Larceny

# Reduced burglary model with only murder and larceny_theft as predictors.
fit5 <- lm(
  formula = burglary ~ murder + larceny_theft,
  data = crime2
)
summary(fit5)
## 
## Call:
## lm(formula = burglary ~ murder + larceny_theft, data = crime2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -297.93 -101.83    2.21   98.54  352.09 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -117.85224   90.22758  -1.306    0.198    
## murder          47.15078    8.55929   5.509 1.48e-06 ***
## larceny_theft    0.25566    0.03919   6.524 4.31e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 136.6 on 47 degrees of freedom
## Multiple R-squared:  0.679,  Adjusted R-squared:  0.6654 
## F-statistic: 49.72 on 2 and 47 DF,  p-value: 2.522e-12
# Draw the default lm diagnostic plots for fit5.
plot(fit5)

## Predict murder based on robbery and rape

# Regress murder on robbery and forcible_rape, fitted on crime2.
fit66 <- lm(
  formula = murder ~ robbery + forcible_rape,
  data = crime2
)
summary(fit66)
## 
## Call:
## lm(formula = murder ~ robbery + forcible_rape, data = crime2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.4912 -0.8523 -0.1465  0.8297  4.7814 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.444442   0.806105   0.551    0.584    
## robbery       0.031397   0.003845   8.166 1.44e-10 ***
## forcible_rape 0.030869   0.019368   1.594    0.118    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.542 on 47 degrees of freedom
## Multiple R-squared:  0.5928, Adjusted R-squared:  0.5754 
## F-statistic:  34.2 on 2 and 47 DF,  p-value: 6.786e-10
# Draw the default lm diagnostic plots for fit66.
plot(fit66)

Murders and burglaries w/ the states’ populations added

# Bubble chart: point size is mapped to population. Start from p2 (the base
# plot without axis limits) and set the limits exactly once, so no existing
# scale is replaced and no "Scale for 'x' is already present" message is
# emitted. The x/y mappings are inherited from p2, so only size is mapped here.
p2 +
  geom_point(aes(size = population), color = "red") +
  xlim(0, 10) +
  ylim(0, 1200) +
  ggtitle(
    "MURDERS VERSUS BURGLARIES IN THE U.S.",
    subtitle = "Sizes of circles are proportional to state populations"
  )
## Warning: Removed 1 rows containing missing values (geom_point).

## Use Plotly

# Build the bubble chart as a ggplot and convert it straight to an interactive
# plotly widget. The extra `text` aesthetic supplies the state name for the
# hover tooltip.
p <- ggplotly(
  ggplot(
    data = crime2,
    mapping = aes(
      x = murder,
      y = burglary,
      size = population,
      text = paste("state:", state)
    )
  ) +
    theme_minimal(base_size = 12) +
    geom_point(alpha = 0.5, color = "red") +
    xlim(0, 10) +
    ylim(0, 1200) +
    labs(
      title = "MURDERS VERSUS BURGLARIES IN THE U.S.",
      subtitle = "Sizes of circles are proportional to state populations"
    )
)
p

## Food Stamps Plot

# Read the food-stamps table from the working directory into a tibble; readr
# reports the guessed column types in the message that follows.
food_stamps <- read_csv(file = "food_stamps.csv")
## Parsed with column specification:
## cols(
##   year = col_double(),
##   participants = col_double(),
##   costs = col_double()
## )
# Reusable base chart: participants against year, with axis titles and the
# minimal theme but no geom layer. Later charts add their geoms to it.
food_stamps_chart <- ggplot(
  data = food_stamps,
  mapping = aes(x = year, y = participants)
) +
  labs(x = "Year", y = "Participants (millions)") +
  theme_minimal(base_size = 14)

# Printing it now shows just the empty, labelled canvas.
food_stamps_chart

## Food Stamps Chart

# Plain line chart with default styling.
food_stamps_chart + geom_line()

# Thicker red line, plus a title.
food_stamps_chart +
  labs(title = "Line chart") +
  geom_line(color = "red", size = 1.5)

## Different Charts

# Line first, then points on top of it, so each observation is marked on the
# line.
food_stamps_chart +
  labs(title = "Dot-and-line chart") +
  geom_line() +
  geom_point()

# Column chart: geom_col() draws bars whose heights are the data values. It is
# the idiomatic equivalent of geom_bar(stat = "identity"). The vertical grid
# lines are removed.
food_stamps_chart +
  geom_col() +
  ggtitle("Column chart") +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  )

# Make a bar chart
# Same columns, flipped to horizontal bars with coord_flip().
# NOTE(review): the theme still blanks panel.grid.*.x. After the flip that is
# presumably the grid for the participants values rather than for year.
# Confirm which grid lines were meant to be hidden.
food_stamps_chart +
  geom_col() +
  ggtitle("Bar chart") +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  ) +
  coord_flip()

## The difference between color and fill

# set color and fill
# set color and fill
# `color` styles the bar outline and `fill` the bar interior; alpha makes the
# bars semi-transparent. geom_col() is the idiomatic equivalent of
# geom_bar(stat = "identity").
food_stamps_chart +
  geom_col(color = "#888888", fill = "#CCCCCC", alpha = 0.5) +
  ggtitle("Column chart")

Map color to the values of a continuous variable

# Map bar fill to the continuous `costs` column, with a fixed white outline.
# geom_col() is the idiomatic equivalent of geom_bar(stat = "identity").
food_stamps_chart +
  geom_col(aes(fill = costs), color = "white")

Use a ColorBrewer sequential palette

# Fill bars by `costs` using the ColorBrewer "Reds" palette. direction = 1
# sets the palette order, and `name` sets the two-line legend title.
# geom_col() is the idiomatic equivalent of geom_bar(stat = "identity").
food_stamps_chart +
  geom_col(aes(fill = costs), color = "#888888") +
  scale_fill_distiller(
    name = "Cost\n($ billion)",
    palette = "Reds",
    direction = 1
  )

## Control Legend Position

# Same cost-shaded column chart, with the legend moved inside the plot.
# legend.position takes relative coordinates (0-1), here 15% across and 80% up.
# geom_col() is the idiomatic equivalent of geom_bar(stat = "identity").
food_stamps_chart +
  geom_col(aes(fill = costs), color = "#888888") +
  scale_fill_distiller(
    name = "Cost\n($ billion)",
    palette = "Reds",
    direction = 1
  ) +
  theme(legend.position = c(0.15, 0.8))