# Knitr chunk default for the report: echo = TRUE, so each chunk's source code
# is shown alongside its output.
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
## -- Attaching packages ----------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.1 v dplyr 1.0.0
## v tidyr 1.1.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts -------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(readr)
library(ggplot2)
library(dplyr)
# Working directory for the relative paths used in this script ("crime.csv",
# "crime2.csv", "food_stamps.csv"). The location can be overridden through the
# CRIME_DATA_DIR environment variable; the default is the original author's
# folder. A missing directory is reported instead of aborting the run, so the
# steps that only need the downloaded crime data still work on other machines.
data_dir <- Sys.getenv(
  "CRIME_DATA_DIR",
  unset = "C:/Users/Jerome/Documents/Data_Science_110/Datasets"
)
if (dir.exists(data_dir)) {
  setwd(data_dir)
} else {
  warning(
    "Data directory not found, keeping the current working directory: ",
    data_dir,
    call. = FALSE
  )
}
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Download the crime-rates-by-state table; the recorded column specification
# below shows one character column (state) and eight numeric columns.
crime <- read_csv("http://datasets.flowingdata.com/crimeRatesByState2005.csv")
## Parsed with column specification:
## cols(
## state = col_character(),
## murder = col_double(),
## forcible_rape = col_double(),
## robbery = col_double(),
## aggravated_assault = col_double(),
## burglary = col_double(),
## larceny_theft = col_double(),
## motor_vehicle_theft = col_double(),
## population = col_double()
## )
# Keep a local copy of the raw download.
write.csv(crime, file = "crime.csv")
# Summarise the data frame itself. The earlier call, summary("crimes"), only
# described a one-element character vector and said nothing about the data.
summary(crime)
# Bare canvas: only the data and the x/y mapping, no layers yet.
ggplot(data = crime, mapping = aes(x = murder, y = burglary))
# Change the theme
# p1 is the same canvas with axis titles and the minimal theme. Printing it
# here shows the themed, still empty plot.
p1 <- ggplot(data = crime, mapping = aes(x = murder, y = burglary)) +
  labs(
    x = "Murder rates in each state per 100,000",
    y = "Burglary rates in each state per 100,000"
  ) +
  theme_minimal(base_size = 12)
p1
## Show the data
# Adding a point layer turns the canvas into a scatter plot.
p1 + geom_point()
## Remove DC and USA
# Both rows are dropped in a single pass. `%in%` always returns TRUE or FALSE,
# so a missing state name cannot become an NA row index the way `!=` would.
crime2 <- crime[
  !(crime$state %in% c("District of Columbia", "United States")),
]
# Same themed scatter plot as before, now on the filtered data.
p2 <- ggplot(crime2, aes(x = murder, y = burglary)) +
  xlab("Murder rates in each state per 100,000") +
  ylab("Burglary rates in each state per 100,000") +
  theme_minimal(base_size = 12)
p2 + geom_point()
## Adjust the axes
# Zoom with coord_cartesian() instead of xlim()/ylim(). Scale limits discard
# out-of-range rows before any statistic is computed; the earlier run reported
# "Removed 1 rows" for every plot built on p3, so the smoother and regression
# line were fitted without one state. coord_cartesian() only changes the
# visible window, so every row of crime2 is drawn and used in the fits.
p3 <- p2 + coord_cartesian(xlim = c(0, 10), ylim = c(0, 1200))
p3 + geom_point()
## Add confidence interval w/ “Smoother”
# The loess method and formula are spelled out (they are what geom_smooth()
# picked by default here) so no informational message is printed.
p4 <- p3 +
  geom_point() +
  geom_smooth(method = "loess", formula = y ~ x, color = "red")
p4
## Add Regression Line
p5 <- p3 +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x)
p5
## Add a title, make the line dashed, and remove the confidence interval
p6 <- p3 +
  geom_point() +
  geom_smooth(
    method = "lm",
    formula = y ~ x,
    se = FALSE,
    linetype = "dotdash",
    size = 0.3
  ) +
  ggtitle("MURDERS VERSUS BURGLARIES IN THE U.S.")
p6
# Correlation between the burglary and murder columns of the filtered data.
with(crime2, cor(burglary, murder))
## [1] 0.6231757
# Simple linear regression of burglary rate on murder rate.
fit1 <- lm(formula = burglary ~ murder, data = crime2)
summary(fit1)
##
## Call:
## lm(formula = burglary ~ murder, data = crime2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -385.38 -132.23 2.97 138.78 386.32
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 398.26 59.28 6.718 1.99e-08 ***
## murder 62.17 11.26 5.521 1.34e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 186.5 on 48 degrees of freedom
## Multiple R-squared: 0.3883, Adjusted R-squared: 0.3756
## F-statistic: 30.48 on 1 and 48 DF, p-value: 1.342e-06
##install.packages("GGally")
library(GGally)
## Warning: package 'GGally' was built under R version 4.0.2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Pairwise plot matrix of columns 2 through 8 of crime2.
ggpairs(data = crime2, columns = 2:8)
# Save the filtered table to disk.
write.csv(crime2, file = "crime2.csv")
# Multiple regression: murder rate explained by five other crime rates.
fit2 <- lm(
  formula = murder ~ burglary + forcible_rape + aggravated_assault +
    larceny_theft + motor_vehicle_theft,
  data = crime2
)
summary(fit2)
##
## Call:
## lm(formula = murder ~ burglary + forcible_rape + aggravated_assault +
## larceny_theft + motor_vehicle_theft, data = crime2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.0273 -0.8949 -0.1242 0.6644 3.3027
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.0252038 1.0642566 2.843 0.006761 **
## burglary 0.0050527 0.0014907 3.389 0.001488 **
## forcible_rape -0.0461137 0.0216312 -2.132 0.038644 *
## aggravated_assault 0.0094150 0.0022044 4.271 0.000102 ***
## larceny_theft -0.0015024 0.0006003 -2.503 0.016104 *
## motor_vehicle_theft 0.0020733 0.0012653 1.639 0.108440
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.522 on 44 degrees of freedom
## Multiple R-squared: 0.6287, Adjusted R-squared: 0.5866
## F-statistic: 14.9 on 5 and 44 DF, p-value: 1.49e-08
# Diagnostic plots for the fitted model.
plot(fit2)
# Same model as fit2 without motor_vehicle_theft, the one predictor whose
# coefficient was not significant in the fit2 summary.
fit3 <- lm(
  formula = murder ~ burglary + forcible_rape + aggravated_assault +
    larceny_theft,
  data = crime2
)
summary(fit3)
##
## Call:
## lm(formula = murder ~ burglary + forcible_rape + aggravated_assault +
## larceny_theft, data = crime2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.1805 -1.0081 -0.3100 0.8769 3.4506
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.8415937 1.0779713 2.636 0.011468 *
## burglary 0.0055794 0.0014826 3.763 0.000483 ***
## forcible_rape -0.0471186 0.0220236 -2.139 0.037858 *
## aggravated_assault 0.0096735 0.0022395 4.320 8.51e-05 ***
## larceny_theft -0.0012679 0.0005938 -2.135 0.038211 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.55 on 45 degrees of freedom
## Multiple R-squared: 0.6061, Adjusted R-squared: 0.5711
## F-statistic: 17.31 on 4 and 45 DF, p-value: 1.153e-08
# Diagnostic plots for the reduced model.
plot(fit3)
## Predict Burglary
# Multiple regression: burglary rate explained by five other crime rates.
fit4 <- lm(
  formula = burglary ~ murder + forcible_rape + aggravated_assault +
    larceny_theft + motor_vehicle_theft,
  data = crime2
)
summary(fit4)
##
## Call:
## lm(formula = burglary ~ murder + forcible_rape + aggravated_assault +
## larceny_theft + motor_vehicle_theft, data = crime2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -271.30 -79.31 -7.40 94.15 387.49
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -138.27831 102.16469 -1.353 0.18281
## murder 40.97562 12.08935 3.389 0.00149 **
## forcible_rape 2.19565 2.01915 1.087 0.28278
## aggravated_assault 0.12375 0.23536 0.526 0.60168
## larceny_theft 0.22136 0.04716 4.693 2.64e-05 ***
## motor_vehicle_theft 0.06020 0.11702 0.514 0.60955
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 137 on 44 degrees of freedom
## Multiple R-squared: 0.6975, Adjusted R-squared: 0.6631
## F-statistic: 20.29 on 5 and 44 DF, p-value: 1.907e-10
# Diagnostic plots for the fitted model.
plot(fit4)
# Reduced burglary model keeping only murder and larceny_theft, the two
# predictors that were significant in the fit4 summary.
fit5 <- lm(formula = burglary ~ murder + larceny_theft, data = crime2)
summary(fit5)
##
## Call:
## lm(formula = burglary ~ murder + larceny_theft, data = crime2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -297.93 -101.83 2.21 98.54 352.09
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -117.85224 90.22758 -1.306 0.198
## murder 47.15078 8.55929 5.509 1.48e-06 ***
## larceny_theft 0.25566 0.03919 6.524 4.31e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 136.6 on 47 degrees of freedom
## Multiple R-squared: 0.679, Adjusted R-squared: 0.6654
## F-statistic: 49.72 on 2 and 47 DF, p-value: 2.522e-12
# Diagnostic plots for the reduced model.
plot(fit5)
## Predict murder based on robbery and rape
# Two-predictor regression of murder rate on robbery and forcible rape rates.
fit66 <- lm(formula = murder ~ robbery + forcible_rape, data = crime2)
summary(fit66)
##
## Call:
## lm(formula = murder ~ robbery + forcible_rape, data = crime2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.4912 -0.8523 -0.1465 0.8297 4.7814
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.444442 0.806105 0.551 0.584
## robbery 0.031397 0.003845 8.166 1.44e-10 ***
## forcible_rape 0.030869 0.019368 1.594 0.118
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.542 on 47 degrees of freedom
## Multiple R-squared: 0.5928, Adjusted R-squared: 0.5754
## F-statistic: 34.2 on 2 and 47 DF, p-value: 6.786e-10
# Diagnostic plots for the fitted model.
plot(fit66)
# Bubble chart: p3 already carries the x/y mapping (inherited from p2) and the
# 0-10 / 0-1200 axis window, so neither is repeated here. Only the size
# aesthetic is added, which also avoids the "Scale for 'x' is already present"
# messages that re-adding xlim()/ylim() produced.
p3 +
  geom_point(mapping = aes(size = population), color = "red") +
  ggtitle(
    "MURDERS VERSUS BURGLARIES IN THE U.S.",
    subtitle = "Sizes of circles are proportional to state populations"
  )
## Warning: Removed 1 rows containing missing values (geom_point).
## Use Plotly
# Interactive version of the bubble chart. The ggplot is built inline and
# passed straight to ggplotly(); the `text` aesthetic attaches a
# "state: <name>" label to every point.
p <- ggplotly(
  ggplot(
    crime2,
    aes(
      x = murder,
      y = burglary,
      size = population,
      text = paste("state:", state)
    )
  ) +
    theme_minimal(base_size = 12) +
    geom_point(alpha = 0.5, color = "red") +
    xlim(0, 10) +
    ylim(0, 1200) +
    ggtitle(
      "MURDERS VERSUS BURGLARIES IN THE U.S.",
      subtitle = "Sizes of circles are proportional to state populations"
    )
)
p
## Food Stamps Plot
# Load the food stamps table from the working directory.
food_stamps <- read_csv(file = "food_stamps.csv")
## Parsed with column specification:
## cols(
## year = col_double(),
## participants = col_double(),
## costs = col_double()
## )
# Shared base for all food stamps charts: participants by year, with axis
# titles and the minimal theme but no geometry yet.
food_stamps_chart <- ggplot(
  data = food_stamps,
  mapping = aes(x = year, y = participants)
) +
  labs(x = "Year", y = "Participants (millions)") +
  theme_minimal(base_size = 14)
food_stamps_chart
## Food Stamps Chart
# Plain line with default styling.
food_stamps_chart + geom_line()
# Thicker red line with a title.
food_stamps_chart +
  geom_line(color = "red", size = 1.5) +
  labs(title = "Line chart")
## Different Charts
# Line with a point marker on every observation.
food_stamps_chart +
  geom_line() +
  geom_point() +
  labs(title = "Dot-and-line chart")
# Columns whose heights are the participants values; geom_col() is the
# identity-stat form of geom_bar(). Vertical grid lines are removed.
food_stamps_chart +
  geom_col() +
  labs(title = "Column chart") +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  )
# Make a bar chart
# Same columns, flipped so the bars run horizontally.
food_stamps_chart +
  geom_col() +
  labs(title = "Bar chart") +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  ) +
  coord_flip()
## The difference between color and fill
# set color and fill
# `color` styles the bar outline and `fill` the bar interior; both are fixed
# values here.
food_stamps_chart +
  geom_col(color = "#888888", fill = "#CCCCCC", alpha = 0.5) +
  labs(title = "Column chart")
# Fill mapped to the costs column, with a fixed white outline.
food_stamps_chart +
  geom_col(mapping = aes(fill = costs), color = "white")
# Same mapping with a grey outline and the "Reds" palette for the fill scale.
food_stamps_chart +
  geom_col(mapping = aes(fill = costs), color = "#888888") +
  scale_fill_distiller(
    name = "Cost\n($ billion)",
    palette = "Reds",
    direction = 1
  )
## Control Legend Position
# Same cost-filled columns, with the legend placed at relative coordinates
# (0.15, 0.8) through theme(legend.position = ...).
food_stamps_chart +
  geom_col(mapping = aes(fill = costs), color = "#888888") +
  scale_fill_distiller(
    name = "Cost\n($ billion)",
    palette = "Reds",
    direction = 1
  ) +
  theme(legend.position = c(0.15, 0.8))