The goal of this assignment is to give you practice in preparing different datasets for downstream analysis work
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.4.0 v purrr 1.0.1
## v tibble 3.1.6 v dplyr 1.0.10
## v tidyr 1.2.0 v stringr 1.5.0
## v readr 2.1.2 v forcats 0.5.2
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(maps)
##
## Attaching package: 'maps'
##
## The following object is masked from 'package:purrr':
##
## map
library(mapdata)
# From Nick Climaco's NAFTA country population
country_url <- ("https://raw.githubusercontent.com/Nick-Climaco/Rdataset/main/nafta_countries.csv")
country_file <- data.frame(read.csv(country_url, header = TRUE))
# From Genesis Middleton county GDP data
county_url <-("https://raw.githubusercontent.com/eddiexunyc/607_project_2/main/Resources/gdp_by_county.csv")
county_file <- data.frame(read.csv(county_url, header = TRUE))
# Interest Rate Data
interest_rate_url <- ("https://raw.githubusercontent.com/eddiexunyc/607_project_2/main/Resources/interest_rate.csv")
interest_file <- data.frame(read.csv(interest_rate_url, header = TRUE, sep = ";"))
# view the country dataframe
head(country_file)
## ï..id Country X1994 X1995 X1996 X1997 X1998
## 1 1 Mexico 88314424 89969572 91586555 93183094 94767284
## 2 2 United States 263126000 266278000 269394000 272657000 275854000
## 3 3 Canada 29000663 29302311 29610218 29905948 30155173
## X1999 X2000
## 1 96334810 97873442
## 2 279040000 282162411
## 3 30401286 30685730
# view the county GDP dataframe
head(county_file)
## ï..FIPS Countyname Postal LineCode IndustryName
## 1 NA
## 2 NA
## 3 01001 Autauga AL 1 All Industries
## 4 01001 Autauga AL 2 Private goods-producing industries
## 5 01001 Autauga AL 3 Private services-providing industries
## 6 01001 Autauga AL 4 Government and government enterprises
## Gross.domestic.product..GDP..by.county X X.1 X.2
## 1 (thousands of dollars)
## 2 2012 2013 2014 2015
## 3 1383941 1363368 1402516 1539406
## 4 286396 310468 323582 346355
## 5 948490 904599 928438 1037309
## 6 149055 148301 150496 155742
# view the interest rate dataframe
head(interest_file)
## ï..date federal_reserve_system european_central_bank swiss_national_bank
## 1 1970-01-01 NA NA NA
## 2 1970-02-01 NA NA NA
## 3 1970-03-01 NA NA NA
## 4 1970-04-01 NA NA NA
## 5 1970-05-01 NA NA NA
## 6 1970-06-01 NA NA NA
## bank_england reserve_bank_australia bank_japan bank_brazil
## 1 8.0 NA NA NA
## 2 8.0 NA NA NA
## 3 8.0 NA NA NA
## 4 7.5 NA NA NA
## 5 7.0 NA NA NA
## 6 7.0 NA NA NA
# rename the column
colnames(country_file) <- c("ID", "Country", "1994", "1995", "1996", "1997", "1998", "1999", "2000")
# pivot longer the year and population
country_tidy_df <- country_file %>%
pivot_longer(c("1994", "1995", "1996", "1997", "1998", "1999", "2000"), names_to = "Year", values_to = "Population")
knitr::kable(country_tidy_df)
| ID | Country | Year | Population |
|---|---|---|---|
| 1 | Mexico | 1994 | 88314424 |
| 1 | Mexico | 1995 | 89969572 |
| 1 | Mexico | 1996 | 91586555 |
| 1 | Mexico | 1997 | 93183094 |
| 1 | Mexico | 1998 | 94767284 |
| 1 | Mexico | 1999 | 96334810 |
| 1 | Mexico | 2000 | 97873442 |
| 2 | United States | 1994 | 263126000 |
| 2 | United States | 1995 | 266278000 |
| 2 | United States | 1996 | 269394000 |
| 2 | United States | 1997 | 272657000 |
| 2 | United States | 1998 | 275854000 |
| 2 | United States | 1999 | 279040000 |
| 2 | United States | 2000 | 282162411 |
| 3 | Canada | 1994 | 29000663 |
| 3 | Canada | 1995 | 29302311 |
| 3 | Canada | 1996 | 29610218 |
| 3 | Canada | 1997 | 29905948 |
| 3 | Canada | 1998 | 30155173 |
| 3 | Canada | 1999 | 30401286 |
| 3 | Canada | 2000 | 30685730 |
# change the class of Year to numeric
country_tidy_df$Year <- as.integer(country_tidy_df$Year)
# calculate the population difference and rate of change
growth_rate <- country_tidy_df %>%
group_by(Country) %>%
arrange(Year) %>%
mutate(pop_difference = Population - lag(Population),
rate_percent = round((pop_difference/lag(Population)) * 100, digits = 2))
# rename the column
colnames(growth_rate) <- c("ID", "Country", "Year", "Population", "Population Difference", "Population % Change")
knitr::kable(growth_rate)
| ID | Country | Year | Population | Population Difference | Population % Change |
|---|---|---|---|---|---|
| 1 | Mexico | 1994 | 88314424 | NA | NA |
| 2 | United States | 1994 | 263126000 | NA | NA |
| 3 | Canada | 1994 | 29000663 | NA | NA |
| 1 | Mexico | 1995 | 89969572 | 1655148 | 1.87 |
| 2 | United States | 1995 | 266278000 | 3152000 | 1.20 |
| 3 | Canada | 1995 | 29302311 | 301648 | 1.04 |
| 1 | Mexico | 1996 | 91586555 | 1616983 | 1.80 |
| 2 | United States | 1996 | 269394000 | 3116000 | 1.17 |
| 3 | Canada | 1996 | 29610218 | 307907 | 1.05 |
| 1 | Mexico | 1997 | 93183094 | 1596539 | 1.74 |
| 2 | United States | 1997 | 272657000 | 3263000 | 1.21 |
| 3 | Canada | 1997 | 29905948 | 295730 | 1.00 |
| 1 | Mexico | 1998 | 94767284 | 1584190 | 1.70 |
| 2 | United States | 1998 | 275854000 | 3197000 | 1.17 |
| 3 | Canada | 1998 | 30155173 | 249225 | 0.83 |
| 1 | Mexico | 1999 | 96334810 | 1567526 | 1.65 |
| 2 | United States | 1999 | 279040000 | 3186000 | 1.15 |
| 3 | Canada | 1999 | 30401286 | 246113 | 0.82 |
| 1 | Mexico | 2000 | 97873442 | 1538632 | 1.60 |
| 2 | United States | 2000 | 282162411 | 3122411 | 1.12 |
| 3 | Canada | 2000 | 30685730 | 284444 | 0.94 |
# graph a line chart of rate change in population
growth_bar <- ggplot(data = subset(growth_rate, !is.na(`Population % Change`)), aes(x = Year, y = `Population % Change`, fill = `Country`)) +
geom_bar(stat = "identity", position = "dodge") +
geom_text(aes(label = `Population % Change`), position = position_dodge(0.9), color = "black", vjust = 1, hjust = 0.5)
growth_bar
Based on the graph, it shows that while Mexico have the highest growth rate among them all, the rate is getting smaller over time. Meanwhile, the US growth rate remains steady.
# remove first 2 rows
county_tidy_df <- county_file[-(1:2),]
# remove last 3 rows
county_tidy_df <- county_tidy_df[-(12453:12458),]
# rename the column
colnames(county_tidy_df) <- c("FIPS", "County Name", "State", "Line Code", "Industry Name", "GDP in 2012 (thousands of dollars)", "GDP in 2013 (thousands of dollars)", "GDP in 2014 (thousands of dollars)", "GDP in 2015 (thousands of dollars)")
head(county_tidy_df, 10)
## FIPS County Name State Line Code Industry Name
## 3 01001 Autauga AL 1 All Industries
## 4 01001 Autauga AL 2 Private goods-producing industries
## 5 01001 Autauga AL 3 Private services-providing industries
## 6 01001 Autauga AL 4 Government and government enterprises
## 7 01003 Baldwin AL 1 All Industries
## 8 01003 Baldwin AL 2 Private goods-producing industries
## 9 01003 Baldwin AL 3 Private services-providing industries
## 10 01003 Baldwin AL 4 Government and government enterprises
## 11 01005 Barbour AL 1 All Industries
## 12 01005 Barbour AL 2 Private goods-producing industries
## GDP in 2012 (thousands of dollars) GDP in 2013 (thousands of dollars)
## 3 1383941 1363368
## 4 286396 310468
## 5 948490 904599
## 6 149055 148301
## 7 5599194 6365080
## 8 681871 698500
## 9 4306173 5037325
## 10 611151 629255
## 11 639833 701750
## 12 253232 303099
## GDP in 2014 (thousands of dollars) GDP in 2015 (thousands of dollars)
## 3 1402516 1539406
## 4 323582 346355
## 5 928438 1037309
## 6 150496 155742
## 7 6547396 6436107
## 8 711443 735432
## 9 5194273 5023903
## 10 641681 676772
## 11 689212 743779
## 12 286467 307299
# change the GDP column from char type to numeric
county_tidy_df[6:9] <- lapply(county_tidy_df[6:9],as.numeric)
## Warning in lapply(county_tidy_df[6:9], as.numeric): NAs introduced by coercion
## Warning in lapply(county_tidy_df[6:9], as.numeric): NAs introduced by coercion
## Warning in lapply(county_tidy_df[6:9], as.numeric): NAs introduced by coercion
## Warning in lapply(county_tidy_df[6:9], as.numeric): NAs introduced by coercion
# calculate the sum and average of all GDPs per county
industry_df <- county_tidy_df %>%
mutate(industry_total = rowSums(county_tidy_df[6:9], na.rm = TRUE),
industry_average_gdp = round(rowMeans(county_tidy_df[6:9], na.rm =TRUE), digits = 2),
High_GDP = apply(county_tidy_df[6:9], 1, max, na.rm = TRUE),
Low_GDP = apply(county_tidy_df[6:9], 1, min, na.rm = TRUE))
# rename the column
colnames(industry_df)[10] <- "Industry Total"
colnames(industry_df)[11] <- "Industry Avg GDP"
colnames(industry_df)[12] <- "Highest GDP in the County"
colnames(industry_df)[13] <- "Lowest GDP in the County"
head(industry_df)
## FIPS County Name State Line Code Industry Name
## 3 01001 Autauga AL 1 All Industries
## 4 01001 Autauga AL 2 Private goods-producing industries
## 5 01001 Autauga AL 3 Private services-providing industries
## 6 01001 Autauga AL 4 Government and government enterprises
## 7 01003 Baldwin AL 1 All Industries
## 8 01003 Baldwin AL 2 Private goods-producing industries
## GDP in 2012 (thousands of dollars) GDP in 2013 (thousands of dollars)
## 3 1383941 1363368
## 4 286396 310468
## 5 948490 904599
## 6 149055 148301
## 7 5599194 6365080
## 8 681871 698500
## GDP in 2014 (thousands of dollars) GDP in 2015 (thousands of dollars)
## 3 1402516 1539406
## 4 323582 346355
## 5 928438 1037309
## 6 150496 155742
## 7 6547396 6436107
## 8 711443 735432
## Industry Total Industry Avg GDP Highest GDP in the County
## 3 5689231 1422307.8 1539406
## 4 1266801 316700.2 346355
## 5 3818836 954709.0 1037309
## 6 603594 150898.5 155742
## 7 24947777 6236944.2 6547396
## 8 2827246 706811.5 735432
## Lowest GDP in the County
## 3 1363368
## 4 286396
## 5 904599
## 6 148301
## 7 5599194
## 8 681871
# summarize the industry GDP per State
industry_plot_data <- subset(industry_df, `Industry Name` == "All Industries") %>%
group_by(`State`) %>%
summarise("Total GDP" = sum(`Industry Total`),
"Highest GDP" = max(`Highest GDP in the County`),
"Lowest GDP" = min(`Lowest GDP in the County`),
)
# convert state abbreviations to name with a function [link]("https://rdrr.io/cran/usdata/src/R/abbr2state.R")
abbr2state <- function(abbr){
ab <- tolower(c("AL",
"AK", "AZ", "KS", "UT", "CO", "CT",
"DE", "FL", "GA", "HI", "ID", "IL",
"IN", "IA", "AR", "KY", "LA", "ME",
"MD", "MA", "MI", "MN", "MS", "MO",
"MT", "NE", "NV", "NH", "NJ", "NM",
"NY", "NC", "ND", "OH", "OK", "OR",
"PA", "RI", "SC", "SD", "TN", "TX",
"CA", "VT", "VA", "WA", "WV", "WI",
"WY", "DC"))
st <- c("alabama",
"alaska", "arizona", "kansas",
"utah", "colorado", "connecticut",
"delaware", "florida", "georgia",
"hawaii", "idaho", "illinois",
"indiana", "iowa", "arkansas",
"kentucky", "louisiana", "maine",
"maryland", "massachusetts", "michigan",
"minnesota", "mississippi", "missouri",
"montana", "nebraska", "nevada",
"new hampshire", "new jersey", "new mexico",
"new york", "north carolina", "north dakota",
"ohio", "oklahoma", "oregon",
"pennsylvania", "rhode island", "south carolina",
"south dakota", "tennessee", "texas",
"california", "vermont", "virginia",
"washington", "west virginia", "wisconsin",
"wyoming", "district of columbia")
st[match(tolower(abbr), ab)]
}
industry_plot_data["Full Name"] <- abbr2state(industry_plot_data$State)
head(industry_plot_data, 10)
## # A tibble: 10 x 5
## State `Total GDP` `Highest GDP` `Lowest GDP` `Full Name`
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 AK 220476432 27571441 23255 alaska
## 2 AL 772157204 42641286 118371 alabama
## 3 AR 458771584 29087938 73313 arkansas
## 4 AZ 1125178114 215381372 496754 arizona
## 5 CA 9360952426 691948578 48646 california
## 6 CO 1186164551 75738881 18005 colorado
## 7 CT 999163128 93157588 3919490 connecticut
## 8 DC 473056501 125434630 112736453 district of columbia
## 9 DE 260738432 49375769 5877245 delaware
## 10 FL 3304723391 141734334 135069 florida
# graph a map of the industry GDP
state <- map_data("state")
industry_map <- merge(state, industry_plot_data, by.x = "region", by.y = "Full Name")
industry_map <- arrange(industry_map, group, order)
ggplot(industry_map, aes(x = long, y = lat, group = group, fill = `Total GDP`)) +
geom_polygon(colour = "black")
In conclusion, it shows the total GDP in all industries in US and it appears that California have the highest GDP among all states.
# rename a column
colnames(interest_file) <- c("Date", "Federal Reserve System", "European Central Bank", "Swiss National Bank", "Bank of England", "Reserve Bank of Australia", "Bank of Japan", "Bank of Brazil")
head(interest_file)
## Date Federal Reserve System European Central Bank Swiss National Bank
## 1 1970-01-01 NA NA NA
## 2 1970-02-01 NA NA NA
## 3 1970-03-01 NA NA NA
## 4 1970-04-01 NA NA NA
## 5 1970-05-01 NA NA NA
## 6 1970-06-01 NA NA NA
## Bank of England Reserve Bank of Australia Bank of Japan Bank of Brazil
## 1 8.0 NA NA NA
## 2 8.0 NA NA NA
## 3 8.0 NA NA NA
## 4 7.5 NA NA NA
## 5 7.0 NA NA NA
## 6 7.0 NA NA NA
# filter the interest rate starting from the 2000
twentyfirst_century_df <- interest_file[interest_file$Date >= "2000-01-01" & interest_file$Date <= "2022-12-31", ]
twentyfirst_century_df$Date <- as.Date(twentyfirst_century_df$Date)
# pivot longer the central banks and interest rate
interest_rate_in_21stdata <- twentyfirst_century_df %>%
pivot_longer(c( "Federal Reserve System", "European Central Bank", "Swiss National Bank", "Bank of England", "Reserve Bank of Australia", "Bank of Japan", "Bank of Brazil"), names_to = "Central Bank Name", values_to = "Interest Rate")
head(interest_rate_in_21stdata)
## # A tibble: 6 x 3
## Date `Central Bank Name` `Interest Rate`
## <date> <chr> <dbl>
## 1 2000-01-01 Federal Reserve System NA
## 2 2000-01-01 European Central Bank 3
## 3 2000-01-01 Swiss National Bank NA
## 4 2000-01-01 Bank of England 5.5
## 5 2000-01-01 Reserve Bank of Australia 5
## 6 2000-01-01 Bank of Japan NA
ggplot(interest_rate_in_21stdata, aes(x = `Date`, y = `Interest Rate`, color = `Central Bank Name`)) +
geom_point() +
geom_hline(yintercept = 2, color = "red")
## Warning: Removed 5047 rows containing missing values (`geom_point()`).
In conclusion, the interest rate in 21st century shows the economic situation of every central bank’s home country. For economy to be in good health, the interest rate usually need to be in in the range of 0 to 2%. Given that, Bank of Brazil have high interest rate to combat their economic uncertainty. Other central banks such as Bank of England and European Central Bank have been dropping their interest rates which show economic improvement.