if (!require("tidyverse")) install.packages("tidyverse")
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
if (!require("tidycensus")) install.packages("tidycensus")
## Loading required package: tidycensus
if (!require("sf")) install.packages("sf")
## Loading required package: sf
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
if (!require("mapview")) install.packages("mapview")
## Loading required package: mapview
library(tidyverse)
library(tidycensus)
library(sf)
library(mapview)
# Transmitting API key
# census_api_key("YOUR_KEY_HERE")
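# A safer pattern (sketch): install the key into .Renviron once with
# install = TRUE, so the raw key never needs to appear in a script.
# census_api_key("YOUR_KEY_HERE", install = TRUE)
# readRenviron("~/.Renviron") # reload the stored key without restarting R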
# Fetching ACS codebooks
DetailedTables <- load_variables(2022, "acs5", cache = TRUE)
SubjectTables <- load_variables(2022, "acs5/subject", cache = TRUE)
ProfileTables <- load_variables(2022, "acs5/profile", cache = TRUE)
All_ACS_Variables <- bind_rows(DetailedTables, ProfileTables, SubjectTables)
rm(DetailedTables, SubjectTables, ProfileTables)
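# Sketch: one way to confirm the variable ID requested below is to search
# the combined codebook for "renter" in the label text. The matches are
# stored (the object name is illustrative); View(Renter_variables) browses them.
Renter_variables <- All_ACS_Variables %>%
  filter(str_detect(str_to_lower(label), "renter"))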
# Specify a variable to estimate
VariableList <- c(Estimate_ = "DP04_0047P")
# Fetching data
mydata <- get_acs(
  geography = "county",
  state = "TN",
  variables = VariableList,
  year = 2022,
  survey = "acs5",
  output = "wide",
  geometry = TRUE)
## Getting data from the 2018-2022 5-year ACS
## Warning: • You have not set a Census API key. Users without a key are limited to 500
## queries per day and may experience performance limitations.
## ℹ For best results, get a Census API key at
## http://api.census.gov/data/key_signup.html and then supply the key to the
## `census_api_key()` function to use it throughout your tidycensus session.
## This warning is displayed once per session.
## Downloading feature geometry from the Census website. To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
## Using the ACS Data Profile
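# As the message above suggests, the shapefile download can be cached so
# county geometry is not re-fetched on every run (sketch; set this option
# before the get_acs() call):
# options(tigris_use_cache = TRUE)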
# Reformatting data
mydata <- separate_wider_delim(mydata,
                               NAME,
                               delim = ", ",
                               names = c("County", "State"))
# Filtering data
mydata <- mydata %>%
  filter(County %in% c("Cheatham County",
                       "Davidson County",
                       "Robertson County",
                       "Rutherford County",
                       "Sumner County",
                       "Williamson County",
                       "Wilson County"))
# Mapping data
mapdata <- mydata %>%
  rename(Estimate = Estimate_E, Estimate_MOE = Estimate_M)
mapdata <- st_as_sf(mapdata)
mapviewOptions(basemaps.color.shuffle = FALSE)
mapview(mapdata, zcol = "Estimate",
        layer.name = "Estimate",
        popup = TRUE)
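If a shareable copy of the interactive map is needed, the mapview object can be written to a standalone HTML file with mapshot() (a sketch; the object name and output file name below are placeholders):
m <- mapview(mapdata, zcol = "Estimate",
             layer.name = "Estimate",
             popup = TRUE)
mapshot(m, url = "renter_share_map.html")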
For Part 1, the map shows that Davidson County is well above every other county in renter share, with 45.8% of its households renter-occupied. Rutherford County is next closest at 34.8%, and every other county in the group falls below 30%. Renter households are thus concentrated in and around Nashville and Murfreesboro and are less prevalent in the outlying counties.
# Install and load tidyverse
if (!require("tidyverse"))
install.packages("tidyverse")
library(tidyverse)
# Read the data
# NOTE: You may edit the URL to load a different dataset
mydata <- read.csv("https://raw.githubusercontent.com/drkblake/Data/main/SocialData.csv")
head(mydata, 10)
## ID Type Impressions
## 1 1 Photo 695
## 2 2 Text 940
## 3 3 Photo 1196
## 4 4 Photo 936
## 5 5 Photo 1389
## 6 6 Photo 857
## 7 7 Text 797
## 8 8 Photo 1810
## 9 9 Photo 1086
## 10 10 Video 1416
# Specify the DV and IV
# NOTE: You may edit the Impressions and Type variable names
mydata$DV <- mydata$Impressions
mydata$IV <- mydata$Type
# Graph the group distributions and averages
averages <- group_by(mydata, IV) %>%
  summarise(mean = mean(DV, na.rm = TRUE))
ggplot(mydata, aes(x = DV)) +
  geom_histogram(color = "black", fill = "#1f78b4") +
  facet_grid(IV ~ .) +
  geom_vline(data = averages, aes(xintercept = mean))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Calculate and show the group counts, means, standard
# deviations, minimums, and maximums
group_by(mydata, IV) %>%
  summarise(
    count = n(),
    mean = mean(DV, na.rm = TRUE),
    sd = sd(DV, na.rm = TRUE),
    min = min(DV, na.rm = TRUE),
    max = max(DV, na.rm = TRUE))
## # A tibble: 3 × 6
## IV count mean sd min max
## <chr> <int> <dbl> <dbl> <int> <int>
## 1 Photo 58 1035. 297. 397 1810
## 2 Text 43 999. 278. 515 1746
## 3 Video 39 1370. 307. 829 1952
options(scipen = 999)
oneway.test(mydata$DV ~ mydata$IV,
            var.equal = FALSE)
##
## One-way analysis of means (not assuming equal variances)
##
## data: mydata$DV and mydata$IV
## F = 19.119, num df = 2.000, denom df = 85.525, p-value = 0.000000137
# If the ANOVA detects a significant difference, run
# this post-hoc procedure to learn which
# group pairs differed significantly.
anova_1 <- aov(mydata$DV ~ mydata$IV)
TukeyHSD(anova_1)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = mydata$DV ~ mydata$IV)
##
## $`mydata$IV`
## diff lwr upr p adj
## Text-Photo -36.35605 -176.6202 103.9081 0.8126345
## Video-Photo 334.87710 190.5414 479.2128 0.0000005
## Video-Text 371.23315 217.1076 525.3587 0.0000002
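As a quick visual check on the Tukey comparisons, side-by-side boxplots make the Video gap easy to see (a minimal sketch reusing the DV and IV columns defined above):
ggplot(mydata, aes(x = IV, y = DV)) +
  geom_boxplot(fill = "#1f78b4") +
  labs(x = "Post type", y = "Impressions")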
For Part 2, the group averages show that Video posts earn roughly 1,370 impressions, while Photo and Text sit at about 1,035 and 999 respectively. Photo and Text are therefore about the same, and the Tukey test confirms their difference is not significant (p = 0.81), while Video significantly outperforms both, with some video posts approaching 2,000 impressions. Keeping the videographer would therefore be a wise move, since video is clearly lifting social media engagement and impressions.
# Packages and libraries
if (!require("tidyverse")) install.packages("tidyverse")
if (!require("ggplot2")) install.packages("ggplot2")
if (!require("tidytext")) install.packages("tidytext")
## Loading required package: tidytext
library(tidyverse)
library(ggplot2)
library(tidytext)
library(stringr) # Part of the tidyverse package
library(forcats) # Part of the tidyverse package
mydata <- read.csv("https://raw.githubusercontent.com/drkblake/Data/main/WhiteHouse.csv")
tidy_text <- mydata %>%
  unnest_tokens(word, Full.Text) %>%
  count(Author, word, sort = TRUE)
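# Aside (sketch): tidytext also ships a general-purpose stop_words
# lexicon. Filtering with it would change every count below, so the
# result is stored under a separate, illustrative name and left unused.
tidy_text_nostop <- tidy_text %>%
  anti_join(stop_words, by = "word")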
# Total words by source
total_words <- tidy_text %>%
  group_by(Author) %>%
  summarize(total = sum(n))
# Adding total words column to data file
tidy_text <- left_join(tidy_text, total_words)
## Joining with `by = join_by(Author)`
# Deleting data frames that are no longer needed
rm("total_words")
# TF-IDF
tidy_text_tf_idf <- tidy_text %>%
  bind_tf_idf(word, Author, n) %>%
  arrange(desc(tf_idf))
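# For intuition (sketch): bind_tf_idf() computes tf as a word's share of
# its author's total words, and idf as the natural log of the number of
# authors divided by the number of authors using the word. By hand:
n_authors <- n_distinct(tidy_text$Author)
tf_idf_check <- tidy_text %>%
  mutate(tf = n / total) %>%
  group_by(word) %>%
  mutate(idf = log(n_authors / n())) %>%
  ungroup() %>%
  mutate(tf_idf = tf * idf)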
options(scipen = 999)
tidy_text_tf_idf %>%
  group_by(Author) %>%
  slice_max(tf_idf, n = 12) %>%
  ungroup() %>%
  ggplot(aes(tf_idf, fct_reorder(word, tf_idf), fill = Author)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~Author, ncol = 2, scales = "free") +
  labs(x = "tf-idf", y = NULL)
my_stopwords <- tibble(word = c("re"))
tidy_text <- tidy_text %>%
  anti_join(my_stopwords)
## Joining with `by = join_by(word)`
# Total words by source
total_words <- tidy_text %>%
  group_by(Author) %>%
  summarize(total = sum(n))
# Adding total words column to data file
tidy_text <- left_join(tidy_text, total_words)
## Joining with `by = join_by(Author, total)`
# Deleting data frames that are no longer needed
rm("total_words")
# TF-IDF
tidy_text_tf_idf <- tidy_text %>%
  bind_tf_idf(word, Author, n) %>%
  arrange(desc(tf_idf))
options(scipen = 999)
tidy_text_tf_idf %>%
  group_by(Author) %>%
  slice_max(tf_idf, n = 12) %>%
  ungroup() %>%
  ggplot(aes(tf_idf, fct_reorder(word, tf_idf), fill = Author)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~Author, ncol = 2, scales = "free") +
  labs(x = "tf-idf", y = NULL)
searchterms <- "Jobs|Employment|Work|Workers"
mydata$WhiteHouse <- ifelse(grepl(searchterms,
                                  mydata$Full.Text,
                                  ignore.case = TRUE), 1, 0)
sum(mydata$WhiteHouse)
## [1] 1020
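One caveat about the pattern above: grepl() matches substrings, so "Work" also flags tweets containing words such as "Network" or "Workshop." A stricter variant (a sketch; the count is stored rather than printed) adds word boundaries:
strictterms <- "\\b(Jobs|Employment|Work|Workers)\\b"
strict_hits <- sum(grepl(strictterms, mydata$Full.Text,
                         ignore.case = TRUE))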
For Part 3, I looked at the use of the term “jobs.” It was a fairly frequent term on the White House Twitter account, ranking #38 among the words used on the platform. By combining it with employment, work, and workers, I found that at least one of those terms appeared in 1,020 of the tweets. This shows that the White House Twitter account raises employment often, and it gives us a window into the administration's potential priorities.