if (!require("tidyverse")) install.packages("tidyverse")
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
if (!require("tidycensus")) install.packages("tidycensus")
## Loading required package: tidycensus
if (!require("sf")) install.packages("sf")
## Loading required package: sf
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
if (!require("mapview")) install.packages("mapview")
## Loading required package: mapview
library(tidyverse)
library(tidycensus)
library(sf)
library(mapview)
# Transmitting API key
# census_api_key("YOUR_KEY_HERE")
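# A safer pattern (sketch): install the key into .Renviron once with
# install = TRUE, so the raw key never needs to appear in a script.
# census_api_key("YOUR_KEY_HERE", install = TRUE)
# readRenviron("~/.Renviron") # reload the stored key without restarting R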
# Fetching ACS codebooks
DetailedTables <- load_variables(2022, "acs5", cache = TRUE)
SubjectTables <- load_variables(2022, "acs5/subject", cache = TRUE)
ProfileTables <- load_variables(2022, "acs5/profile", cache = TRUE)
All_ACS_Variables <- bind_rows(DetailedTables, ProfileTables, SubjectTables)
rm(DetailedTables, SubjectTables, ProfileTables)
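# Sketch: one way to confirm the variable ID requested below is to search
# the combined codebook for "renter" in the label text. The matches are
# stored (the object name is illustrative); View(Renter_variables) browses them.
Renter_variables <- All_ACS_Variables %>%
  filter(str_detect(str_to_lower(label), "renter"))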
# Specify a variable to estimate
VariableList <- c(Estimate_ = "DP04_0047P")
# Fetching data
mydata <- get_acs(
  geography = "county",
  state = "TN",
  variables = VariableList,
  year = 2022,
  survey = "acs5",
  output = "wide",
  geometry = TRUE)
## Getting data from the 2018-2022 5-year ACS
## Warning: • You have not set a Census API key. Users without a key are limited to 500
## queries per day and may experience performance limitations.
## ℹ For best results, get a Census API key at
## http://api.census.gov/data/key_signup.html and then supply the key to the
## `census_api_key()` function to use it throughout your tidycensus session.
## This warning is displayed once per session.
## Downloading feature geometry from the Census website. To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
## Using the ACS Data Profile
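# As the message above suggests, the shapefile download can be cached so
# county geometry is not re-fetched on every run (sketch; set this option
# before the get_acs() call):
# options(tigris_use_cache = TRUE)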
# Reformatting data
mydata <- separate_wider_delim(mydata,
                               NAME,
                               delim = ", ",
                               names = c("County", "State"))
# Filtering data
mydata <- mydata %>%
  filter(County %in% c("Cheatham County",
                       "Davidson County",
                       "Robertson County",
                       "Rutherford County",
                       "Sumner County",
                       "Williamson County",
                       "Wilson County"))
# Mapping data
mapdata <- mydata %>%
  rename(Estimate = Estimate_E, Estimate_MOE = Estimate_M)
mapdata <- st_as_sf(mapdata)
mapviewOptions(basemaps.color.shuffle = FALSE)
mapview(mapdata, zcol = "Estimate",
        layer.name = "Estimate",
        popup = TRUE)
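If a shareable copy of the interactive map is needed, the mapview object can be written to a standalone HTML file with mapshot() (a sketch; the object name and output file name below are placeholders):
m <- mapview(mapdata, zcol = "Estimate",
             layer.name = "Estimate",
             popup = TRUE)
mapshot(m, url = "renter_share_map.html")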
For Part 1, the map shows that Davidson County is well above every other county in renter share, with 45.8% of its households renter-occupied. Rutherford County is next closest at 34.8%, and every other county in the group falls below 30%. Renter households are thus concentrated in and around Nashville and Murfreesboro and are less prevalent in the outlying counties.
# Install and load tidyverse
if (!require("tidyverse"))
install.packages("tidyverse")
library(tidyverse)
# Read the data
# NOTE: You may edit the URL to load a different dataset
mydata <- read.csv("https://raw.githubusercontent.com/drkblake/Data/main/SocialData.csv")
head(mydata, 10)
## ID Type Impressions
## 1 1 Photo 695
## 2 2 Text 940
## 3 3 Photo 1196
## 4 4 Photo 936
## 5 5 Photo 1389
## 6 6 Photo 857
## 7 7 Text 797
## 8 8 Photo 1810
## 9 9 Photo 1086
## 10 10 Video 1416
# Specify the DV and IV
# NOTE: You may edit the Impressions and Type variable names
mydata$DV <- mydata$Impressions
mydata$IV <- mydata$Type
# Graph the group distributions and averages
averages <- group_by(mydata, IV) %>%
  summarise(mean = mean(DV, na.rm = TRUE))
ggplot(mydata, aes(x = DV)) +
  geom_histogram(color = "black", fill = "#1f78b4") +
  facet_grid(IV ~ .) +
  geom_vline(data = averages, aes(xintercept = mean))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Calculate and show the group counts, means, standard
# deviations, minimums, and maximums
group_by(mydata, IV) %>%
  summarise(
    count = n(),
    mean = mean(DV, na.rm = TRUE),
    sd = sd(DV, na.rm = TRUE),
    min = min(DV, na.rm = TRUE),
    max = max(DV, na.rm = TRUE))
## # A tibble: 3 × 6
## IV count mean sd min max
## <chr> <int> <dbl> <dbl> <int> <int>
## 1 Photo 58 1035. 297. 397 1810
## 2 Text 43 999. 278. 515 1746
## 3 Video 39 1370. 307. 829 1952
options(scipen = 999)
oneway.test(mydata$DV ~ mydata$IV,
            var.equal = FALSE)
##
## One-way analysis of means (not assuming equal variances)
##
## data: mydata$DV and mydata$IV
## F = 19.119, num df = 2.000, denom df = 85.525, p-value = 0.000000137
# If the ANOVA detects a significant difference, run
# this post-hoc procedure to learn which
# group pairs differed significantly.
anova_1 <- aov(mydata$DV ~ mydata$IV)
TukeyHSD(anova_1)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = mydata$DV ~ mydata$IV)
##
## $`mydata$IV`
## diff lwr upr p adj
## Text-Photo -36.35605 -176.6202 103.9081 0.8126345
## Video-Photo 334.87710 190.5414 479.2128 0.0000005
## Video-Text 371.23315 217.1076 525.3587 0.0000002
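As a quick visual check on the Tukey comparisons, side-by-side boxplots make the Video gap easy to see (a minimal sketch reusing the DV and IV columns defined above):
ggplot(mydata, aes(x = IV, y = DV)) +
  geom_boxplot(fill = "#1f78b4") +
  labs(x = "Post type", y = "Impressions")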
For Part 2, the group averages show that Video posts earn roughly 1,370 impressions, while Photo and Text sit at about 1,035 and 999 respectively. Photo and Text are therefore about the same, and the Tukey test confirms their difference is not significant (p = 0.81), while Video significantly outperforms both, with some video posts approaching 2,000 impressions. Keeping the videographer would therefore be a wise move, since video is clearly lifting social media engagement and impressions.
# Packages and libraries
if (!require("tidyverse")) install.packages("tidyverse")
if (!require("ggplot2")) install.packages("ggplot2")
if (!require("tidytext")) install.packages("tidytext")
## Loading required package: tidytext
library(tidyverse)
library(ggplot2)
library(tidytext)
library(stringr) # Part of the tidyverse package
library(forcats) # Part of the tidyverse package
mydata <- read.csv("https://raw.githubusercontent.com/drkblake/Data/main/WhiteHouse.csv")
tidy_text <- mydata %>%
  unnest_tokens(word, Full.Text) %>%
  count(Author, word, sort = TRUE)
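# Aside (sketch): tidytext also ships a general-purpose stop_words
# lexicon. Filtering with it would change every count below, so the
# result is stored under a separate, illustrative name and left unused.
tidy_text_nostop <- tidy_text %>%
  anti_join(stop_words, by = "word")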
# Total words by source
total_words <- tidy_text %>%
  group_by(Author) %>%
  summarize(total = sum(n))
# Adding total words column to data file
tidy_text <- left_join(tidy_text, total_words)
## Joining with `by = join_by(Author)`
# Deleting data frames that are no longer needed
rm("total_words")
# TF-IDF
tidy_text_tf_idf <- tidy_text %>%
  bind_tf_idf(word, Author, n) %>%
  arrange(desc(tf_idf))
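# For intuition (sketch): bind_tf_idf() computes tf as a word's share of
# its author's total words, and idf as the natural log of the number of
# authors divided by the number of authors using the word. By hand:
n_authors <- n_distinct(tidy_text$Author)
tf_idf_check <- tidy_text %>%
  mutate(tf = n / total) %>%
  group_by(word) %>%
  mutate(idf = log(n_authors / n())) %>%
  ungroup() %>%
  mutate(tf_idf = tf * idf)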
options(scipen = 999)
tidy_text_tf_idf %>%
  group_by(Author) %>%
  slice_max(tf_idf, n = 12) %>%
  ungroup() %>%
  ggplot(aes(tf_idf, fct_reorder(word, tf_idf), fill = Author)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~Author, ncol = 2, scales = "free") +
  labs(x = "tf-idf", y = NULL)
my_stopwords <- tibble(word = c("re"))
tidy_text <- tidy_text %>%
  anti_join(my_stopwords)
## Joining with `by = join_by(word)`
# Total words by source
total_words <- tidy_text %>%
  group_by(Author) %>%
  summarize(total = sum(n))
# Adding total words column to data file
tidy_text <- left_join(tidy_text, total_words)
## Joining with `by = join_by(Author, total)`
# Deleting data frames that are no longer needed
rm("total_words")
# TF-IDF
tidy_text_tf_idf <- tidy_text %>%
  bind_tf_idf(word, Author, n) %>%
  arrange(desc(tf_idf))
options(scipen = 999)
tidy_text_tf_idf %>%
  group_by(Author) %>%
  slice_max(tf_idf, n = 12) %>%
  ungroup() %>%
  ggplot(aes(tf_idf, fct_reorder(word, tf_idf), fill = Author)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~Author, ncol = 2, scales = "free") +
  labs(x = "tf-idf", y = NULL)
searchterms <- "Jobs|Employment|Work|Workers"
mydata$WhiteHouse <- ifelse(grepl(searchterms,
                                  mydata$Full.Text,
                                  ignore.case = TRUE), 1, 0)
sum(mydata$WhiteHouse)
## [1] 1020
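One caveat about the pattern above: grepl() matches substrings, so "Work" also flags tweets containing words such as "Network" or "Workshop." A stricter variant (a sketch; the count is stored rather than printed) adds word boundaries:
strictterms <- "\\b(Jobs|Employment|Work|Workers)\\b"
strict_hits <- sum(grepl(strictterms, mydata$Full.Text,
                         ignore.case = TRUE))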
For Part 3, I looked at the use of the term “jobs.” It was a fairly frequent term on the White House Twitter account, ranking #38 among the words used on the platform. By combining it with employment, work, and workers, I found that at least one of those terms appeared in 1,020 of the tweets. This shows that the White House Twitter account raises employment often, and it gives us a window into the administration's potential priorities.