Contributors:
Peter Fernandes
Arushi Arora
Project 2 requires creating 3 tidy datasets by either using the untidy datasets from week5 discussion or choose any of our own dataset. It requires the data set to be wide and untidy so that we read the data from a CSV and transform and tidy the datasets. we have used 3 of the datasets from the discussion and tried to transform and tidy the data. We have analysed the data over plots using the ggplot library.
Analysis:
Calculate max spending by state and by agency Plot top 5 spending states and lowest 5 spending agencies
# Read the csv file from GitHub with the table data
# Rename the columns to make them easier to use
df_csv <- read.csv("https://raw.githubusercontent.com/Araisedtotwo/Project02/main/DataUSACart.csv", header = TRUE) %>%
as_tibble() %>%
rename(id = ID.Geography ,
agencyid= ID.Agency ,
amount = Obligation.Amount..2017.) %>%
print()
## # A tibble: 6,557 x 5
## id Geography agencyid Agency amount
## <chr> <chr> <chr> <chr> <dbl>
## 1 04000US~ New Jersey 12C3 Natural Resources Conservation ~ -3.77e5
## 2 04000US~ District of Colu~ 1100 Executive Office of the Preside~ 6.31e7
## 3 04000US~ New York 1100 Executive Office of the Preside~ -2.54e4
## 4 04000US~ Virginia 1100 Executive Office of the Preside~ 1.14e5
## 5 04000US~ Arizona 1127 Office of the National Drug Con~ 9.70e6
## 6 04000US~ California 1127 Office of the National Drug Con~ 1.47e7
## 7 04000US~ Colorado 1127 Office of the National Drug Con~ 1.42e7
## 8 04000US~ Delaware 1127 Office of the National Drug Con~ 2.42e5
## 9 04000US~ Florida 1127 Office of the National Drug Con~ 7.39e6
## 10 04000US~ Hawaii 1127 Office of the National Drug Con~ 9.76e5
## # ... with 6,547 more rows
# 6557 obs and 5 vars
#mysqlconnection = dbConnect(MySQL(), user='root', password='xxxx', dbname ='federal_spending',host='aws-movie-rating.cc7r4c0owv8u.us-east-1.rds.amazonaws.com')
#result = dbSendQuery(mysqlconnection,"SELECT `ID Geography` AS id,`Geography`,`ID Agency`AS agencyid ,`Agency`,`Obligation Amount (2017)` AS amount FROM federal_spending.fedspend")
#df=fetch(result,n= -1)
#head(df)
# Remove amounts that are in -ve
df1 <- df_csv %>%
filter(amount > 0)
head(df1)
## # A tibble: 6 x 5
## id Geography agencyid Agency amount
## <chr> <chr> <chr> <chr> <dbl>
## 1 04000US~ District of Colum~ 1100 Executive Office of the Preside~ 6.31e7
## 2 04000US~ Virginia 1100 Executive Office of the Preside~ 1.14e5
## 3 04000US~ Arizona 1127 Office of the National Drug Con~ 9.70e6
## 4 04000US~ California 1127 Office of the National Drug Con~ 1.47e7
## 5 04000US~ Colorado 1127 Office of the National Drug Con~ 1.42e7
## 6 04000US~ Delaware 1127 Office of the National Drug Con~ 2.42e5
# 6158 obs and 5 vars
df2 <- df1 %>%
arrange(amount)
summary(df2)
## id Geography agencyid Agency
## Length:6158 Length:6158 Length:6158 Length:6158
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## amount
## Min. :1.030e+02
## 1st Qu.:2.921e+05
## Median :3.233e+06
## Mean :3.314e+08
## 3rd Qu.:3.147e+07
## Max. :9.858e+10
state_spending <- df2 %>% group_by(Geography) %>% summarize(sum_state = sum(amount)) %>%
arrange(desc(sum_state, Geography))
## `summarise()` ungrouping output (override with `.groups` argument)
head(state_spending)
## # A tibble: 6 x 2
## Geography sum_state
## <chr> <dbl>
## 1 California 186831546017.
## 2 Pennsylvania 157053839877.
## 3 Florida 135029984070.
## 4 Texas 126263335225.
## 5 Indiana 107089641966.
## 6 New York 95698829951.
# California has the highest Federal Spending at $186,831,546,017
df7 <- state_spending %>% top_n(5)
## Selecting by sum_state
plot1 <- ggplot(df7, aes(Geography,sum_state )) +
geom_bar(aes(fill = Geography), position = "dodge", stat = "identity") +
geom_text(aes(label = sum_state), vjust = -.3) +
ylab("Amount Spent") +
ggtitle("5 highest spending states")
plot1
agency_spending <- df2 %>% group_by(Agency) %>% summarize(sum_agency = sum(amount)) %>%
arrange(desc(sum_agency, Agency))
## `summarise()` ungrouping output (override with `.groups` argument)
head(agency_spending)
## # A tibble: 6 x 2
## Agency sum_agency
## <chr> <dbl>
## 1 Centers for Medicare and Medicaid Services 687665667131.
## 2 Social Security Administration 606352947125.
## 3 Department of the Navy 92445245668.
## 4 Under Secretary for Benefits/Veterans Benefits Administration 77393474082
## 5 Department of the Army 62684553714.
## 6 Department of the Air Force 52762096380.
# Centers for Medicare and Medicaid Services at $687,665,667,131 is the highest spending agency
df8 <- agency_spending %>% top_n(-5)
## Selecting by sum_agency
plot2 <- ggplot(df8, aes(Agency,sum_agency/1000 )) +
geom_bar(aes(fill = Agency), position = "dodge", stat = "identity") + theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank()) +
geom_text(aes(label = sum_agency*1000), vjust = -.3) +
ylab("Amount Spent") +
ggtitle("5 lowest spending agencies")
plot2