Project 2 - Tidying Up Data for Downstream Analysis

The goal of this assignment is to give you practice in preparing different datasets for downstream analysis work

Load Packages

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.4.0      v purrr   1.0.1 
## v tibble  3.1.6      v dplyr   1.0.10
## v tidyr   1.2.0      v stringr 1.5.0 
## v readr   2.1.2      v forcats 0.5.2
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(maps)
## 
## Attaching package: 'maps'
## 
## The following object is masked from 'package:purrr':
## 
##     map
library(mapdata)

Load Datasets from different sources to seperate dataframes

# From Nick Climaco's NAFTA country population
country_url <- ("https://raw.githubusercontent.com/Nick-Climaco/Rdataset/main/nafta_countries.csv")
country_file <- data.frame(read.csv(country_url, header = TRUE))

# From Genesis Middleton county GDP data
county_url <-("https://raw.githubusercontent.com/eddiexunyc/607_project_2/main/Resources/gdp_by_county.csv")
county_file <- data.frame(read.csv(county_url, header = TRUE))

# Interest Rate Data
interest_rate_url <- ("https://raw.githubusercontent.com/eddiexunyc/607_project_2/main/Resources/interest_rate.csv")
interest_file <- data.frame(read.csv(interest_rate_url, header = TRUE, sep = ";"))

View all datasets

# view the country dataframe
head(country_file)
##   ï..id       Country     X1994     X1995     X1996     X1997     X1998
## 1     1        Mexico  88314424  89969572  91586555  93183094  94767284
## 2     2 United States 263126000 266278000 269394000 272657000 275854000
## 3     3        Canada  29000663  29302311  29610218  29905948  30155173
##       X1999     X2000
## 1  96334810  97873442
## 2 279040000 282162411
## 3  30401286  30685730
# view the county GDP dataframe
head(county_file)
##   ï..FIPS Countyname Postal LineCode                            IndustryName
## 1                                 NA                                        
## 2                                 NA                                        
## 3   01001    Autauga     AL        1                          All Industries
## 4   01001    Autauga     AL        2      Private goods-producing industries
## 5   01001    Autauga     AL        3   Private services-providing industries
## 6   01001    Autauga     AL        4   Government and government enterprises
##   Gross.domestic.product..GDP..by.county       X     X.1     X.2
## 1                 (thousands of dollars)                        
## 2                                   2012    2013    2014    2015
## 3                                1383941 1363368 1402516 1539406
## 4                                 286396  310468  323582  346355
## 5                                 948490  904599  928438 1037309
## 6                                 149055  148301  150496  155742
# view the interest rate dataframe
head(interest_file)
##      ï..date federal_reserve_system european_central_bank swiss_national_bank
## 1 1970-01-01                     NA                    NA                  NA
## 2 1970-02-01                     NA                    NA                  NA
## 3 1970-03-01                     NA                    NA                  NA
## 4 1970-04-01                     NA                    NA                  NA
## 5 1970-05-01                     NA                    NA                  NA
## 6 1970-06-01                     NA                    NA                  NA
##   bank_england reserve_bank_australia bank_japan bank_brazil
## 1          8.0                     NA         NA          NA
## 2          8.0                     NA         NA          NA
## 3          8.0                     NA         NA          NA
## 4          7.5                     NA         NA          NA
## 5          7.0                     NA         NA          NA
## 6          7.0                     NA         NA          NA

Data Cleaning on the NAFTA Population Dataset

# rename the column 
colnames(country_file) <- c("ID", "Country", "1994", "1995", "1996", "1997", "1998", "1999", "2000")

# pivot longer the year and population
country_tidy_df <- country_file %>%
  pivot_longer(c("1994", "1995", "1996", "1997", "1998", "1999", "2000"), names_to = "Year", values_to = "Population")

knitr::kable(country_tidy_df)
ID Country Year Population
1 Mexico 1994 88314424
1 Mexico 1995 89969572
1 Mexico 1996 91586555
1 Mexico 1997 93183094
1 Mexico 1998 94767284
1 Mexico 1999 96334810
1 Mexico 2000 97873442
2 United States 1994 263126000
2 United States 1995 266278000
2 United States 1996 269394000
2 United States 1997 272657000
2 United States 1998 275854000
2 United States 1999 279040000
2 United States 2000 282162411
3 Canada 1994 29000663
3 Canada 1995 29302311
3 Canada 1996 29610218
3 Canada 1997 29905948
3 Canada 1998 30155173
3 Canada 1999 30401286
3 Canada 2000 30685730

Data Analysis on the NAFTA Population Dataset

# change the class of Year to numeric
country_tidy_df$Year <- as.integer(country_tidy_df$Year)

# calculate the population difference and rate of change
growth_rate <- country_tidy_df %>%
  group_by(Country) %>%
  arrange(Year) %>%
  mutate(pop_difference = Population - lag(Population),
         rate_percent = round((pop_difference/lag(Population)) * 100, digits = 2))

# rename the column
colnames(growth_rate) <- c("ID", "Country", "Year", "Population", "Population Difference", "Population % Change")

knitr::kable(growth_rate)
ID Country Year Population Population Difference Population % Change
1 Mexico 1994 88314424 NA NA
2 United States 1994 263126000 NA NA
3 Canada 1994 29000663 NA NA
1 Mexico 1995 89969572 1655148 1.87
2 United States 1995 266278000 3152000 1.20
3 Canada 1995 29302311 301648 1.04
1 Mexico 1996 91586555 1616983 1.80
2 United States 1996 269394000 3116000 1.17
3 Canada 1996 29610218 307907 1.05
1 Mexico 1997 93183094 1596539 1.74
2 United States 1997 272657000 3263000 1.21
3 Canada 1997 29905948 295730 1.00
1 Mexico 1998 94767284 1584190 1.70
2 United States 1998 275854000 3197000 1.17
3 Canada 1998 30155173 249225 0.83
1 Mexico 1999 96334810 1567526 1.65
2 United States 1999 279040000 3186000 1.15
3 Canada 1999 30401286 246113 0.82
1 Mexico 2000 97873442 1538632 1.60
2 United States 2000 282162411 3122411 1.12
3 Canada 2000 30685730 284444 0.94
# graph a line chart of rate change in population
growth_bar <- ggplot(data = subset(growth_rate, !is.na(`Population % Change`)), aes(x = Year, y = `Population % Change`, fill = `Country`)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(aes(label = `Population % Change`), position = position_dodge(0.9), color = "black", vjust = 1, hjust = 0.5)

growth_bar

Conclusion in the NAFTA Population Dataset

Based on the graph, it shows that while Mexico have the highest growth rate among them all, the rate is getting smaller over time. Meanwhile, the US growth rate remains steady.

Data Cleaning on the County GDP Dataset

# remove first 2 rows
county_tidy_df <- county_file[-(1:2),]

# remove last 3 rows
county_tidy_df <- county_tidy_df[-(12453:12458),]

# rename the column
colnames(county_tidy_df) <- c("FIPS", "County Name", "State", "Line Code", "Industry Name", "GDP in 2012 (thousands of dollars)", "GDP in 2013 (thousands of dollars)", "GDP in 2014 (thousands of dollars)", "GDP in 2015 (thousands of dollars)")

head(county_tidy_df, 10)
##     FIPS County Name State Line Code                           Industry Name
## 3  01001     Autauga    AL         1                          All Industries
## 4  01001     Autauga    AL         2      Private goods-producing industries
## 5  01001     Autauga    AL         3   Private services-providing industries
## 6  01001     Autauga    AL         4   Government and government enterprises
## 7  01003     Baldwin    AL         1                          All Industries
## 8  01003     Baldwin    AL         2      Private goods-producing industries
## 9  01003     Baldwin    AL         3   Private services-providing industries
## 10 01003     Baldwin    AL         4   Government and government enterprises
## 11 01005     Barbour    AL         1                          All Industries
## 12 01005     Barbour    AL         2      Private goods-producing industries
##    GDP in 2012 (thousands of dollars) GDP in 2013 (thousands of dollars)
## 3                             1383941                            1363368
## 4                              286396                             310468
## 5                              948490                             904599
## 6                              149055                             148301
## 7                             5599194                            6365080
## 8                              681871                             698500
## 9                             4306173                            5037325
## 10                             611151                             629255
## 11                             639833                             701750
## 12                             253232                             303099
##    GDP in 2014 (thousands of dollars) GDP in 2015 (thousands of dollars)
## 3                             1402516                            1539406
## 4                              323582                             346355
## 5                              928438                            1037309
## 6                              150496                             155742
## 7                             6547396                            6436107
## 8                              711443                             735432
## 9                             5194273                            5023903
## 10                             641681                             676772
## 11                             689212                             743779
## 12                             286467                             307299

Data Analysis on the County GDP Dataset

# change the GDP column from char type to numeric
county_tidy_df[6:9] <- lapply(county_tidy_df[6:9],as.numeric)
## Warning in lapply(county_tidy_df[6:9], as.numeric): NAs introduced by coercion

## Warning in lapply(county_tidy_df[6:9], as.numeric): NAs introduced by coercion

## Warning in lapply(county_tidy_df[6:9], as.numeric): NAs introduced by coercion

## Warning in lapply(county_tidy_df[6:9], as.numeric): NAs introduced by coercion
# calculate the sum and average of all GDPs per county
industry_df <- county_tidy_df %>%
  mutate(industry_total = rowSums(county_tidy_df[6:9], na.rm = TRUE),
         industry_average_gdp = round(rowMeans(county_tidy_df[6:9], na.rm =TRUE), digits = 2),
         High_GDP = apply(county_tidy_df[6:9], 1, max, na.rm = TRUE),
         Low_GDP = apply(county_tidy_df[6:9], 1, min, na.rm = TRUE))

# rename the column
colnames(industry_df)[10] <- "Industry Total"
colnames(industry_df)[11] <- "Industry Avg GDP"
colnames(industry_df)[12] <- "Highest GDP in the County"
colnames(industry_df)[13] <- "Lowest GDP in the County"

head(industry_df)
##    FIPS County Name State Line Code                           Industry Name
## 3 01001     Autauga    AL         1                          All Industries
## 4 01001     Autauga    AL         2      Private goods-producing industries
## 5 01001     Autauga    AL         3   Private services-providing industries
## 6 01001     Autauga    AL         4   Government and government enterprises
## 7 01003     Baldwin    AL         1                          All Industries
## 8 01003     Baldwin    AL         2      Private goods-producing industries
##   GDP in 2012 (thousands of dollars) GDP in 2013 (thousands of dollars)
## 3                            1383941                            1363368
## 4                             286396                             310468
## 5                             948490                             904599
## 6                             149055                             148301
## 7                            5599194                            6365080
## 8                             681871                             698500
##   GDP in 2014 (thousands of dollars) GDP in 2015 (thousands of dollars)
## 3                            1402516                            1539406
## 4                             323582                             346355
## 5                             928438                            1037309
## 6                             150496                             155742
## 7                            6547396                            6436107
## 8                             711443                             735432
##   Industry Total Industry Avg GDP Highest GDP in the County
## 3        5689231        1422307.8                   1539406
## 4        1266801         316700.2                    346355
## 5        3818836         954709.0                   1037309
## 6         603594         150898.5                    155742
## 7       24947777        6236944.2                   6547396
## 8        2827246         706811.5                    735432
##   Lowest GDP in the County
## 3                  1363368
## 4                   286396
## 5                   904599
## 6                   148301
## 7                  5599194
## 8                   681871
# summarize the industry GDP per State
industry_plot_data <- subset(industry_df, `Industry Name` == "All Industries") %>%
  group_by(`State`) %>%
  summarise("Total GDP" = sum(`Industry Total`),
            "Highest GDP" = max(`Highest GDP in the County`),
            "Lowest GDP" = min(`Lowest GDP in the County`),
            )
# convert state abbreviations to name with a function [link]("https://rdrr.io/cran/usdata/src/R/abbr2state.R")

abbr2state <- function(abbr){
  ab    <- tolower(c("AL",
             "AK", "AZ", "KS", "UT", "CO", "CT",
             "DE", "FL", "GA", "HI", "ID", "IL",
             "IN", "IA", "AR", "KY", "LA", "ME",
             "MD", "MA", "MI", "MN", "MS", "MO",
             "MT", "NE", "NV", "NH", "NJ", "NM",
             "NY", "NC", "ND", "OH", "OK", "OR",
             "PA", "RI", "SC", "SD", "TN", "TX",
             "CA", "VT", "VA", "WA", "WV", "WI",
             "WY", "DC"))
  st    <- c("alabama",
             "alaska", "arizona", "kansas",
             "utah", "colorado", "connecticut",
             "delaware", "florida", "georgia",
             "hawaii", "idaho", "illinois",
             "indiana", "iowa", "arkansas",
             "kentucky", "louisiana", "maine",
             "maryland", "massachusetts", "michigan",
             "minnesota", "mississippi", "missouri",
             "montana", "nebraska", "nevada",
             "new hampshire", "new jersey", "new mexico",
             "new york", "north carolina", "north dakota",
             "ohio", "oklahoma", "oregon",
             "pennsylvania", "rhode island", "south carolina",
             "south dakota", "tennessee", "texas",
             "california", "vermont", "virginia",
             "washington", "west virginia", "wisconsin",
             "wyoming", "district of columbia")
  st[match(tolower(abbr), ab)]
}

industry_plot_data["Full Name"] <- abbr2state(industry_plot_data$State)

head(industry_plot_data, 10)
## # A tibble: 10 x 5
##    State `Total GDP` `Highest GDP` `Lowest GDP` `Full Name`         
##    <chr>       <dbl>         <dbl>        <dbl> <chr>               
##  1 AK      220476432      27571441        23255 alaska              
##  2 AL      772157204      42641286       118371 alabama             
##  3 AR      458771584      29087938        73313 arkansas            
##  4 AZ     1125178114     215381372       496754 arizona             
##  5 CA     9360952426     691948578        48646 california          
##  6 CO     1186164551      75738881        18005 colorado            
##  7 CT      999163128      93157588      3919490 connecticut         
##  8 DC      473056501     125434630    112736453 district of columbia
##  9 DE      260738432      49375769      5877245 delaware            
## 10 FL     3304723391     141734334       135069 florida
# graph a map of the industry GDP
state <- map_data("state")
industry_map <- merge(state, industry_plot_data, by.x = "region", by.y = "Full Name")
industry_map <- arrange(industry_map, group, order)

ggplot(industry_map, aes(x = long, y = lat, group = group, fill = `Total GDP`)) +
  geom_polygon(colour = "black")

Conclusion on the County GDP Dataset

In conclusion, it shows the total GDP in all industries in US and it appears that California have the highest GDP among all states.

Data Cleaning on the Interest Rate Dataset

# rename a column
colnames(interest_file) <- c("Date", "Federal Reserve System", "European Central Bank", "Swiss National Bank", "Bank of England", "Reserve Bank of Australia", "Bank of Japan", "Bank of Brazil")

head(interest_file)
##         Date Federal Reserve System European Central Bank Swiss National Bank
## 1 1970-01-01                     NA                    NA                  NA
## 2 1970-02-01                     NA                    NA                  NA
## 3 1970-03-01                     NA                    NA                  NA
## 4 1970-04-01                     NA                    NA                  NA
## 5 1970-05-01                     NA                    NA                  NA
## 6 1970-06-01                     NA                    NA                  NA
##   Bank of England Reserve Bank of Australia Bank of Japan Bank of Brazil
## 1             8.0                        NA            NA             NA
## 2             8.0                        NA            NA             NA
## 3             8.0                        NA            NA             NA
## 4             7.5                        NA            NA             NA
## 5             7.0                        NA            NA             NA
## 6             7.0                        NA            NA             NA

Data Analysis on the Interest Rate Dataset

# filter the interest rate starting from the 2000
twentyfirst_century_df <- interest_file[interest_file$Date >= "2000-01-01" & interest_file$Date <= "2022-12-31", ]
twentyfirst_century_df$Date <- as.Date(twentyfirst_century_df$Date)

# pivot longer the central banks and interest rate
interest_rate_in_21stdata <- twentyfirst_century_df %>%
  pivot_longer(c( "Federal Reserve System", "European Central Bank", "Swiss National Bank", "Bank of England", "Reserve Bank of Australia", "Bank of Japan", "Bank of Brazil"), names_to = "Central Bank Name", values_to = "Interest Rate")

head(interest_rate_in_21stdata)
## # A tibble: 6 x 3
##   Date       `Central Bank Name`       `Interest Rate`
##   <date>     <chr>                               <dbl>
## 1 2000-01-01 Federal Reserve System               NA  
## 2 2000-01-01 European Central Bank                 3  
## 3 2000-01-01 Swiss National Bank                  NA  
## 4 2000-01-01 Bank of England                       5.5
## 5 2000-01-01 Reserve Bank of Australia             5  
## 6 2000-01-01 Bank of Japan                        NA
ggplot(interest_rate_in_21stdata, aes(x = `Date`, y = `Interest Rate`, color = `Central Bank Name`)) +
  geom_point() +
  geom_hline(yintercept = 2, color = "red")
## Warning: Removed 5047 rows containing missing values (`geom_point()`).

Conclusion on the Interest Rate Dataset

In conclusion, the interest rate in 21st century shows the economic situation of every central bank’s home country. For economy to be in good health, the interest rate usually need to be in in the range of 0 to 2%. Given that, Bank of Brazil have high interest rate to combat their economic uncertainty. Other central banks such as Bank of England and European Central Bank have been dropping their interest rates which show economic improvement.