Contributors:

      Peter Fernandes
      Arushi Arora
          

Introduction

Project 2 requires creating 3 tidy datasets by either using the untidy datasets from week5 discussion or choose any of our own dataset. It requires the data set to be wide and untidy so that we read the data from a CSV and transform and tidy the datasets. we have used 3 of the datasets from the discussion and tried to transform and tidy the data. We have analysed the data over plots using the ggplot library.

Dataset 3- ‘Federal Spending’ by Alexis Mekueko

Analysis:

Calculate max spending by state and by agency Plot top 5 spending states and lowest 5 spending agencies

Import and Rename Variables in Federal Spending Data

# Read the csv file from GitHub with the table data
# Rename the columns to make them easier to use
df_csv <- read.csv("https://raw.githubusercontent.com/Araisedtotwo/Project02/main/DataUSACart.csv", header = TRUE) %>%
  as_tibble() %>%
  rename(id = ID.Geography , 
         agencyid= ID.Agency ,
         amount = Obligation.Amount..2017.) %>%
  print()
## # A tibble: 6,557 x 5
##    id       Geography         agencyid Agency                             amount
##    <chr>    <chr>             <chr>    <chr>                               <dbl>
##  1 04000US~ New Jersey        12C3     Natural Resources Conservation ~  -3.77e5
##  2 04000US~ District of Colu~ 1100     Executive Office of the Preside~   6.31e7
##  3 04000US~ New York          1100     Executive Office of the Preside~  -2.54e4
##  4 04000US~ Virginia          1100     Executive Office of the Preside~   1.14e5
##  5 04000US~ Arizona           1127     Office of the National Drug Con~   9.70e6
##  6 04000US~ California        1127     Office of the National Drug Con~   1.47e7
##  7 04000US~ Colorado          1127     Office of the National Drug Con~   1.42e7
##  8 04000US~ Delaware          1127     Office of the National Drug Con~   2.42e5
##  9 04000US~ Florida           1127     Office of the National Drug Con~   7.39e6
## 10 04000US~ Hawaii            1127     Office of the National Drug Con~   9.76e5
## # ... with 6,547 more rows
# 6557 obs and 5 vars

Creating AWS MySQL Connection & Reading Data from MySQL

#mysqlconnection = dbConnect(MySQL(), user='root', password='xxxx', dbname ='federal_spending',host='aws-movie-rating.cc7r4c0owv8u.us-east-1.rds.amazonaws.com')
#result = dbSendQuery(mysqlconnection,"SELECT `ID Geography` AS id,`Geography`,`ID Agency`AS agencyid ,`Agency`,`Obligation Amount (2017)` AS amount FROM federal_spending.fedspend")
#df=fetch(result,n= -1)
#head(df)

Clean data

# Remove amounts that are in -ve
df1 <- df_csv %>%
  filter(amount > 0)
head(df1)
## # A tibble: 6 x 5
##   id       Geography          agencyid Agency                             amount
##   <chr>    <chr>              <chr>    <chr>                               <dbl>
## 1 04000US~ District of Colum~ 1100     Executive Office of the Preside~   6.31e7
## 2 04000US~ Virginia           1100     Executive Office of the Preside~   1.14e5
## 3 04000US~ Arizona            1127     Office of the National Drug Con~   9.70e6
## 4 04000US~ California         1127     Office of the National Drug Con~   1.47e7
## 5 04000US~ Colorado           1127     Office of the National Drug Con~   1.42e7
## 6 04000US~ Delaware           1127     Office of the National Drug Con~   2.42e5
# 6158 obs and 5 vars

Let us sort data by spending amount and look at the summary of all variables

df2 <- df1 %>%
  arrange(amount)
summary(df2)
##       id             Geography           agencyid            Agency         
##  Length:6158        Length:6158        Length:6158        Length:6158       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##      amount         
##  Min.   :1.030e+02  
##  1st Qu.:2.921e+05  
##  Median :3.233e+06  
##  Mean   :3.314e+08  
##  3rd Qu.:3.147e+07  
##  Max.   :9.858e+10

Calculate the overall spending by State

state_spending <- df2 %>% group_by(Geography) %>% summarize(sum_state = sum(amount)) %>%
  arrange(desc(sum_state, Geography))
## `summarise()` ungrouping output (override with `.groups` argument)
head(state_spending)
## # A tibble: 6 x 2
##   Geography        sum_state
##   <chr>                <dbl>
## 1 California   186831546017.
## 2 Pennsylvania 157053839877.
## 3 Florida      135029984070.
## 4 Texas        126263335225.
## 5 Indiana      107089641966.
## 6 New York      95698829951.
# California has the highest Federal Spending at $186,831,546,017

Plotting 5 highest spending states

df7 <- state_spending %>% top_n(5) 
## Selecting by sum_state
plot1 <- ggplot(df7, aes(Geography,sum_state )) + 
  geom_bar(aes(fill = Geography), position = "dodge", stat = "identity") + 
  geom_text(aes(label = sum_state), vjust = -.3) +
  ylab("Amount Spent") +
  ggtitle("5 highest spending states")

plot1

Calculate the overall spending by Agency

agency_spending <- df2 %>% group_by(Agency) %>% summarize(sum_agency = sum(amount)) %>%
  arrange(desc(sum_agency, Agency))
## `summarise()` ungrouping output (override with `.groups` argument)
head(agency_spending)
## # A tibble: 6 x 2
##   Agency                                                           sum_agency
##   <chr>                                                                 <dbl>
## 1 Centers for Medicare and Medicaid Services                    687665667131.
## 2 Social Security Administration                                606352947125.
## 3 Department of the Navy                                         92445245668.
## 4 Under Secretary for Benefits/Veterans Benefits Administration  77393474082 
## 5 Department of the Army                                         62684553714.
## 6 Department of the Air Force                                    52762096380.
# Centers for Medicare and Medicaid Services at $687,665,667,131 is the highest spending agency

Plotting 5 lowest spending agencies

df8 <- agency_spending %>% top_n(-5) 
## Selecting by sum_agency
plot2 <- ggplot(df8, aes(Agency,sum_agency/1000 )) + 
  geom_bar(aes(fill = Agency), position = "dodge", stat = "identity") + theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())  +
  geom_text(aes(label = sum_agency*1000), vjust = -.3) +
  ylab("Amount Spent") +
  ggtitle("5 lowest spending agencies")

plot2