library(tidyverse)
library(broom)
library(plotly)
library(tidycensus) # gets census data that we can use to create maps
library(sf) # helper package for mapping
Linking to GEOS 3.9.1, GDAL 3.4.0, PROJ 8.1.1; sf_use_s2() is TRUE
library(leaflet) # interactive mapping package
library(trendyy)
library(usdata) # this package has a conversion utility for state abbreviations to full names
Assignment
census_api_key("686d80e4f7ee63c3023fcefac14d068f56cc4d19", install = TRUE)
Error: A CENSUS_API_KEY already exists. You can overwrite it with the argument overwrite=TRUE
I first entered my census api key.
smokers <- read_csv("Cig_smoking_percent.csv")
Rows: 52 Columns: 5
── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (1): State
dbl (3): Cig_percent, Low_Confidence_Limit, High_Confidence_Limit
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
I then read in the excel data for Cigarette Smokers
states_leaflet <- get_acs(geography = "state",
variables = "cig_percent",
geometry = TRUE)
Getting data from the 2016-2020 5-year ACS
Downloading feature geometry from the Census website. To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
Error: Your API call has errors. The API message returned is error: error: unknown variable 'cig_percentE'.
I used the code above to obtain state population and map data from the census bureau.
nicorette_colors <- colorNumeric(palette = "viridis", domain = nicorette_data$Cig_percent)
states_leaflet %>%
rename(location = NAME) %>%
inner_join(nicorette_data) %>%
leaflet() %>%
addTiles() %>%
addPolygons(weight = 1,
fillColor = ~nicorette_colors(Cig_percent),
label = ~paste0(location, ", Cig Percent = ", Cig_percent),
highlight = highlightOptions(weight = 2)) %>%
setView(-95, 40, zoom = 4) %>%
addLegend(pal = nicorette_colors, values = ~Cig_percent)
Joining, by = "location"
Warning: sf layer has inconsistent datum (+proj=longlat +datum=NAD83 +no_defs).
Need '+proj=longlat +datum=WGS84'
Lastly, I created a chloropleth map using leaflet to show the percent of cigarette smokers among US states that you can hover over to display the information.
nicorette <- trendy("nicorette",
geo = "US",
from = "2020-01-01", to = "2021-01-01")
nicorette_states <- nicorette %>%
get_interest_region()
nicorette_states
I chose the Google search term nicorette because I felt it would be closely related to populations of smokers as most would probably like to quit the habit at some point or another.
nicorette_colors <- colorNumeric(palette = "viridis", domain = nicorette_states$hits)
states_leaflet %>%
rename(location = NAME) %>%
inner_join(nicorette_states) %>%
leaflet() %>%
addTiles() %>%
addPolygons(weight = 1,
fillColor = ~nicorette_colors(hits),
label = ~paste0(location, ", Search volume for nicorette = ", hits),
highlight = highlightOptions(weight = 2)) %>%
setView(-95, 40, zoom = 4) %>%
addLegend(pal = nicorette_colors, values = ~hits)
Joining, by = "location"
Warning: sf layer has inconsistent datum (+proj=longlat +datum=NAD83 +no_defs).
Need '+proj=longlat +datum=WGS84'
I used the code above to create a choloropleth map for search hits for nicorette in the US. Vermont had the highest rate of hits followed by New York.
smokers <- read_csv("Cig_smoking_percent.csv")
Rows: 52 Columns: 5
── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (1): State
dbl (3): Cig_percent, Low_Confidence_Limit, High_Confidence_Limit
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(smokers)
Rows: 52
Columns: 5
$ State <chr> "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "DC", "FL", "GA", "HI", "ID", "IL", "IN", "IA", "KS"…
$ Cig_percent <dbl> 19.2, 19.1, 14.0, 22.7, 11.2, 14.5, 12.2, 16.5, 13.8, 14.5, 16.1, 13.4, 14.7, 15.5, 21.1, 16.6, 17.2…
$ Low_Confidence_Limit <dbl> 17.9, 16.9, 12.7, 20.9, 10.4, 13.6, 11.3, 15.1, 12.3, 13.4, 15.1, 12.3, 13.0, 14.2, 19.8, 15.7, 16.2…
$ High_Confidence_Limit <dbl> 20.5, 21.3, 15.3, 24.5, 12.0, 15.4, 13.1, 17.9, 15.3, 15.6, 17.1, 14.5, 16.4, 16.8, 22.4, 17.5, 18.2…
$ Sample_Size <dbl> 6347, 2698, 7758, 5177, 11118, 8188, 10276, 5011, 4137, 14589, 8783, 7566, 3594, 5128, 7269, 8842, 1…
glimpse(nicorette_states)
Rows: 51
Columns: 5
$ location <chr> "Vermont", "New York", "New Hampshire", "Rhode Island", "Kentucky", "Oregon", "Pennsylvania", "North Carolina", "…
$ hits <int> 100, 69, 52, 51, 51, 47, 47, 46, 45, 45, 44, 44, 44, 44, 43, 43, 42, 42, 42, 41, 41, 40, 40, 40, 39, 39, 38, 37, …
$ keyword <chr> "nicorette", "nicorette", "nicorette", "nicorette", "nicorette", "nicorette", "nicorette", "nicorette", "nicorett…
$ geo <chr> "US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US",…
$ gprop <chr> "web", "web", "web", "web", "web", "web", "web", "web", "web", "web", "web", "web", "web", "web", "web", "web", "…
nicorette_states %>%
mutate(State = state2abbr(location)) %>%
inner_join(smokers)
Joining, by = "State"
nicorette_data <- nicorette_states %>%
mutate(State = state2abbr(location)) %>%
inner_join(smokers)
Joining, by = "State"
nicorette_model <- lm(Cig_percent ~ hits, data = nicorette_data)
glance(nicorette_model)
nicorette_data %>%
drop_na() %>%
plot_ly(x = ~hits,
y = ~Cig_percent,
hoverinfo = "text",
text = ~paste("State: ", location, "<br>", "'Nicorette' search rate: ", hits, "<br>", "Cig smoking percent: ", Cig_percent)) %>%
add_markers(showlegend = F) %>%
add_lines(y = ~fitted(nicorette_model)) %>%
layout(title = "Relationship between google searches for 'nicorette' and smoking percent rates, by state",
xaxis = list(title = "Google search hits for 'nicorette'"),
yaxis = list(title = "State smoking rate"))
NA
According to the statistical analysis for the relationship between Google search hits for nicorette, and smoking percent rate data, the relationship is not statistically significant (p = .42)
tobacco <- trendy("tobacco",
geo = "US",
from = "2020-01-01", to = "2021-01-01")
tobacco_states <- tobacco %>%
get_interest_region()
tobacco_states
tobacco_colors <- colorNumeric(palette = "viridis", domain = tobacco_states$hits)
states_leaflet %>%
rename(location = NAME) %>%
inner_join(tobacco_states) %>%
leaflet() %>%
addTiles() %>%
addPolygons(weight = 1,
fillColor = ~tobacco_colors(hits),
label = ~paste0(location, ", Search volume for tobacco = ", hits),
highlight = highlightOptions(weight = 2)) %>%
setView(-95, 40, zoom = 4) %>%
addLegend(pal = tobacco_colors, values = ~hits)
Joining, by = "location"
Warning: sf layer has inconsistent datum (+proj=longlat +datum=NAD83 +no_defs).
Need '+proj=longlat +datum=WGS84'
The second term related to cigarette smokers that I searched for was ‘tobacco.’ I used trendy and the code above to create a chloropleth map for search hits for tobacco. The lighter states show a higher search rate, the darker purple states show a lower volume.
smokers <- read_csv("Cig_smoking_percent.csv")
Rows: 52 Columns: 5
── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (1): State
dbl (3): Cig_percent, Low_Confidence_Limit, High_Confidence_Limit
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(smokers)
Rows: 52
Columns: 5
$ State <chr> "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "DC", "FL", "GA", "HI", "ID", "IL", "IN", "IA", "KS"…
$ Cig_percent <dbl> 19.2, 19.1, 14.0, 22.7, 11.2, 14.5, 12.2, 16.5, 13.8, 14.5, 16.1, 13.4, 14.7, 15.5, 21.1, 16.6, 17.2…
$ Low_Confidence_Limit <dbl> 17.9, 16.9, 12.7, 20.9, 10.4, 13.6, 11.3, 15.1, 12.3, 13.4, 15.1, 12.3, 13.0, 14.2, 19.8, 15.7, 16.2…
$ High_Confidence_Limit <dbl> 20.5, 21.3, 15.3, 24.5, 12.0, 15.4, 13.1, 17.9, 15.3, 15.6, 17.1, 14.5, 16.4, 16.8, 22.4, 17.5, 18.2…
$ Sample_Size <dbl> 6347, 2698, 7758, 5177, 11118, 8188, 10276, 5011, 4137, 14589, 8783, 7566, 3594, 5128, 7269, 8842, 1…
glimpse(tobacco_states)
Rows: 51
Columns: 5
$ location <chr> "Minnesota", "Arkansas", "North Carolina", "Virginia", "Tennessee", "Kentucky", "West Virginia", "Michigan", "Mar…
$ hits <int> 100, 99, 92, 78, 70, 69, 63, 61, 58, 57, 56, 55, 55, 54, 53, 53, 52, 52, 51, 51, 51, 50, 48, 48, 48, 45, 44, 42, …
$ keyword <chr> "tobacco", "tobacco", "tobacco", "tobacco", "tobacco", "tobacco", "tobacco", "tobacco", "tobacco", "tobacco", "to…
$ geo <chr> "US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US",…
$ gprop <chr> "web", "web", "web", "web", "web", "web", "web", "web", "web", "web", "web", "web", "web", "web", "web", "web", "…
tobacco_states %>%
mutate(State = state2abbr(location)) %>%
inner_join(smokers)
Joining, by = "State"
tobacco_data <- tobacco_states %>%
mutate(State = state2abbr(location)) %>%
inner_join(smokers)
Joining, by = "State"
tobacco_model <- lm(Cig_percent ~ hits, data = tobacco_data)
glance(tobacco_model)
tobacco_data %>%
drop_na() %>%
plot_ly(x = ~hits,
y = ~Cig_percent,
hoverinfo = "text",
text = ~paste("State: ", location, "<br>", "'Tobacco' search rate: ", hits, "<br>", "Cig smoking percent: ", Cig_percent)) %>%
add_markers(showlegend = F) %>%
add_lines(y = ~fitted(tobacco_model)) %>%
layout(title = "Relationship between google searches for 'tobacco' and smoking percent rates, by state",
xaxis = list(title = "Google search hits for 'tobacco'"),
yaxis = list(title = "State smoking rate"))
NA
According to the statistical analysis done above, Google searches for ‘tobacco,’ and cigarette smoking percentage by state do have a statistically significant relationship (p=.0001). There is a positive linear correlation between searches for tobacco and percentage of cigarette smokers. The more searches for tobacco on the web, the higher percentage of smokers in that state.