607_project

Project 2 - Tidying Up Data for Downstream Analysis

The goal of this assignment is to give you practice in preparing different datasets for downstream analysis work

Load Packages

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.4.0      v purrr   1.0.1 
## v tibble  3.1.6      v dplyr   1.0.10
## v tidyr   1.2.0      v stringr 1.5.0 
## v readr   2.1.2      v forcats 0.5.2

## Warning: package 'tibble' was built under R version 4.0.5

## Warning: package 'tidyr' was built under R version 4.0.5

## Warning: package 'readr' was built under R version 4.0.5

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(maps)

## 
## Attaching package: 'maps'
## 
## The following object is masked from 'package:purrr':
## 
##     map

library(mapdata)

Load Datasets from different sources to seperate dataframes

# From Nick Climaco's NAFTA country population
country_url <- ("https://raw.githubusercontent.com/Nick-Climaco/Rdataset/main/nafta_countries.csv")
country_file <- data.frame(read.csv(country_url, header = TRUE))

# From Genesis Middleton county GDP data
county_url <-("https://raw.githubusercontent.com/eddiexunyc/607_project_2/main/Resources/gdp_by_county.csv")
county_file <- data.frame(read.csv(county_url, header = TRUE))

# Interest Rate Data
interest_rate_url <- ("https://raw.githubusercontent.com/eddiexunyc/607_project_2/main/Resources/interest_rate.csv")
interest_file <- data.frame(read.csv(interest_rate_url, header = TRUE, sep = ";"))

View all datasets

# view the country dataframe
head(country_file)

##   ï..id       Country     X1994     X1995     X1996     X1997     X1998
## 1     1        Mexico  88314424  89969572  91586555  93183094  94767284
## 2     2 United States 263126000 266278000 269394000 272657000 275854000
## 3     3        Canada  29000663  29302311  29610218  29905948  30155173
##       X1999     X2000
## 1  96334810  97873442
## 2 279040000 282162411
## 3  30401286  30685730

# view the county GDP dataframe
head(county_file)

##   ï..FIPS Countyname Postal LineCode                            IndustryName
## 1                                 NA                                        
## 2                                 NA                                        
## 3   01001    Autauga     AL        1                          All Industries
## 4   01001    Autauga     AL        2      Private goods-producing industries
## 5   01001    Autauga     AL        3   Private services-providing industries
## 6   01001    Autauga     AL        4   Government and government enterprises
##   Gross.domestic.product..GDP..by.county       X     X.1     X.2
## 1                 (thousands of dollars)                        
## 2                                   2012    2013    2014    2015
## 3                                1383941 1363368 1402516 1539406
## 4                                 286396  310468  323582  346355
## 5                                 948490  904599  928438 1037309
## 6                                 149055  148301  150496  155742

# view the interest rate dataframe
head(interest_file)

##      ï..date federal_reserve_system european_central_bank swiss_national_bank
## 1 1970-01-01                     NA                    NA                  NA
## 2 1970-02-01                     NA                    NA                  NA
## 3 1970-03-01                     NA                    NA                  NA
## 4 1970-04-01                     NA                    NA                  NA
## 5 1970-05-01                     NA                    NA                  NA
## 6 1970-06-01                     NA                    NA                  NA
##   bank_england reserve_bank_australia bank_japan bank_brazil
## 1          8.0                     NA         NA          NA
## 2          8.0                     NA         NA          NA
## 3          8.0                     NA         NA          NA
## 4          7.5                     NA         NA          NA
## 5          7.0                     NA         NA          NA
## 6          7.0                     NA         NA          NA

Data Cleaning on the NAFTA Population Dataset

# rename the column 
colnames(country_file) <- c("ID", "Country", "1994", "1995", "1996", "1997", "1998", "1999", "2000")

# pivot longer the year and population
country_tidy_df <- country_file %>%
  pivot_longer(c("1994", "1995", "1996", "1997", "1998", "1999", "2000"), names_to = "Year", values_to = "Population")

knitr::kable(country_tidy_df)

ID	Country	Year	Population
1	Mexico	1994	88314424
1	Mexico	1995	89969572
1	Mexico	1996	91586555
1	Mexico	1997	93183094
1	Mexico	1998	94767284
1	Mexico	1999	96334810
1	Mexico	2000	97873442
2	United States	1994	263126000
2	United States	1995	266278000
2	United States	1996	269394000
2	United States	1997	272657000
2	United States	1998	275854000
2	United States	1999	279040000
2	United States	2000	282162411
3	Canada	1994	29000663
3	Canada	1995	29302311
3	Canada	1996	29610218
3	Canada	1997	29905948
3	Canada	1998	30155173
3	Canada	1999	30401286
3	Canada	2000	30685730

Data Analysis on the NAFTA Population Dataset

# change the class of Year to numeric
country_tidy_df$Year <- as.integer(country_tidy_df$Year)

# calculate the population difference and rate of change
growth_rate <- country_tidy_df %>%
  group_by(Country) %>%
  arrange(Year) %>%
  mutate(pop_difference = Population - lag(Population),
         rate_percent = round((pop_difference/lag(Population)) * 100, digits = 2))

# rename the column
colnames(growth_rate) <- c("ID", "Country", "Year", "Population", "Population Difference", "Population % Change")

knitr::kable(growth_rate)

ID	Country	Year	Population	Population Difference	Population % Change
1	Mexico	1994	88314424	NA	NA
2	United States	1994	263126000	NA	NA
3	Canada	1994	29000663	NA	NA
1	Mexico	1995	89969572	1655148	1.87
2	United States	1995	266278000	3152000	1.20
3	Canada	1995	29302311	301648	1.04
1	Mexico	1996	91586555	1616983	1.80
2	United States	1996	269394000	3116000	1.17
3	Canada	1996	29610218	307907	1.05
1	Mexico	1997	93183094	1596539	1.74
2	United States	1997	272657000	3263000	1.21
3	Canada	1997	29905948	295730	1.00
1	Mexico	1998	94767284	1584190	1.70
2	United States	1998	275854000	3197000	1.17
3	Canada	1998	30155173	249225	0.83
1	Mexico	1999	96334810	1567526	1.65
2	United States	1999	279040000	3186000	1.15
3	Canada	1999	30401286	246113	0.82
1	Mexico	2000	97873442	1538632	1.60
2	United States	2000	282162411	3122411	1.12
3	Canada	2000	30685730	284444	0.94

# graph a line chart of rate change in population
growth_bar <- ggplot(data = subset(growth_rate, !is.na(`Population % Change`)), aes(x = Year, y = `Population % Change`, fill = `Country`)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(aes(label = `Population % Change`), position = position_dodge(0.9), color = "black", vjust = 1, hjust = 0.5)

growth_bar

Conclusion in the NAFTA Population Dataset

Based on the graph, it shows that while Mexico have the highest growth rate among them all, the rate is getting smaller over time. Meanwhile, the US growth rate remains steady.

Data Cleaning on the County GDP Dataset

# remove first 2 rows
county_tidy_df <- county_file[-(1:2),]

# remove last 3 rows
county_tidy_df <- county_tidy_df[-(12453:12458),]

# rename the column
colnames(county_tidy_df) <- c("FIPS", "County Name", "State", "Line Code", "Industry Name", "GDP in 2012 (thousands of dollars)", "GDP in 2013 (thousands of dollars)", "GDP in 2014 (thousands of dollars)", "GDP in 2015 (thousands of dollars)")

head(county_tidy_df, 10)

##     FIPS County Name State Line Code                           Industry Name
## 3  01001     Autauga    AL         1                          All Industries
## 4  01001     Autauga    AL         2      Private goods-producing industries
## 5  01001     Autauga    AL         3   Private services-providing industries
## 6  01001     Autauga    AL         4   Government and government enterprises
## 7  01003     Baldwin    AL         1                          All Industries
## 8  01003     Baldwin    AL         2      Private goods-producing industries
## 9  01003     Baldwin    AL         3   Private services-providing industries
## 10 01003     Baldwin    AL         4   Government and government enterprises
## 11 01005     Barbour    AL         1                          All Industries
## 12 01005     Barbour    AL         2      Private goods-producing industries
##    GDP in 2012 (thousands of dollars) GDP in 2013 (thousands of dollars)
## 3                             1383941                            1363368
## 4                              286396                             310468
## 5                              948490                             904599
## 6                              149055                             148301
## 7                             5599194                            6365080
## 8                              681871                             698500
## 9                             4306173                            5037325
## 10                             611151                             629255
## 11                             639833                             701750
## 12                             253232                             303099
##    GDP in 2014 (thousands of dollars) GDP in 2015 (thousands of dollars)
## 3                             1402516                            1539406
## 4                              323582                             346355
## 5                              928438                            1037309
## 6                              150496                             155742
## 7                             6547396                            6436107
## 8                              711443                             735432
## 9                             5194273                            5023903
## 10                             641681                             676772
## 11                             689212                             743779
## 12                             286467                             307299

Data Analysis on the County GDP Dataset

# change the GDP column from char type to numeric
county_tidy_df[6:9] <- lapply(county_tidy_df[6:9],as.numeric)

## Warning in lapply(county_tidy_df[6:9], as.numeric): NAs introduced by coercion

## Warning in lapply(county_tidy_df[6:9], as.numeric): NAs introduced by coercion

## Warning in lapply(county_tidy_df[6:9], as.numeric): NAs introduced by coercion

## Warning in lapply(county_tidy_df[6:9], as.numeric): NAs introduced by coercion

# calculate the sum and average of all GDPs per county
industry_df <- county_tidy_df %>%
  mutate(industry_total = rowSums(county_tidy_df[6:9], na.rm = TRUE),
         industry_average_gdp = round(rowMeans(county_tidy_df[6:9], na.rm =TRUE), digits = 2),
         High_GDP = apply(county_tidy_df[6:9], 1, max, na.rm = TRUE),
         Low_GDP = apply(county_tidy_df[6:9], 1, min, na.rm = TRUE))

# rename the column
colnames(industry_df)[10] <- "Industry Total"
colnames(industry_df)[11] <- "Industry Avg GDP"
colnames(industry_df)[12] <- "Highest GDP in the County"
colnames(industry_df)[13] <- "Lowest GDP in the County"

head(industry_df)

##    FIPS County Name State Line Code                           Industry Name
## 3 01001     Autauga    AL         1                          All Industries
## 4 01001     Autauga    AL         2      Private goods-producing industries
## 5 01001     Autauga    AL         3   Private services-providing industries
## 6 01001     Autauga    AL         4   Government and government enterprises
## 7 01003     Baldwin    AL         1                          All Industries
## 8 01003     Baldwin    AL         2      Private goods-producing industries
##   GDP in 2012 (thousands of dollars) GDP in 2013 (thousands of dollars)
## 3                            1383941                            1363368
## 4                             286396                             310468
## 5                             948490                             904599
## 6                             149055                             148301
## 7                            5599194                            6365080
## 8                             681871                             698500
##   GDP in 2014 (thousands of dollars) GDP in 2015 (thousands of dollars)
## 3                            1402516                            1539406
## 4                             323582                             346355
## 5                             928438                            1037309
## 6                             150496                             155742
## 7                            6547396                            6436107
## 8                             711443                             735432
##   Industry Total Industry Avg GDP Highest GDP in the County
## 3        5689231        1422307.8                   1539406
## 4        1266801         316700.2                    346355
## 5        3818836         954709.0                   1037309
## 6         603594         150898.5                    155742
## 7       24947777        6236944.2                   6547396
## 8        2827246         706811.5                    735432
##   Lowest GDP in the County
## 3                  1363368
## 4                   286396
## 5                   904599
## 6                   148301
## 7                  5599194
## 8                   681871

# summarize the industry GDP per State
industry_plot_data <- subset(industry_df, `Industry Name` == "All Industries") %>%
  group_by(`State`) %>%
  summarise("Total GDP" = sum(`Industry Total`),
            "Highest GDP" = max(`Highest GDP in the County`),
            "Lowest GDP" = min(`Lowest GDP in the County`),
            )
# convert state abbreviations to name with a function [link]("https://rdrr.io/cran/usdata/src/R/abbr2state.R")

abbr2state <- function(abbr){
  ab    <- tolower(c("AL",
             "AK", "AZ", "KS", "UT", "CO", "CT",
             "DE", "FL", "GA", "HI", "ID", "IL",
             "IN", "IA", "AR", "KY", "LA", "ME",
             "MD", "MA", "MI", "MN", "MS", "MO",
             "MT", "NE", "NV", "NH", "NJ", "NM",
             "NY", "NC", "ND", "OH", "OK", "OR",
             "PA", "RI", "SC", "SD", "TN", "TX",
             "CA", "VT", "VA", "WA", "WV", "WI",
             "WY", "DC"))
  st    <- c("alabama",
             "alaska", "arizona", "kansas",
             "utah", "colorado", "connecticut",
             "delaware", "florida", "georgia",
             "hawaii", "idaho", "illinois",
             "indiana", "iowa", "arkansas",
             "kentucky", "louisiana", "maine",
             "maryland", "massachusetts", "michigan",
             "minnesota", "mississippi", "missouri",
             "montana", "nebraska", "nevada",
             "new hampshire", "new jersey", "new mexico",
             "new york", "north carolina", "north dakota",
             "ohio", "oklahoma", "oregon",
             "pennsylvania", "rhode island", "south carolina",
             "south dakota", "tennessee", "texas",
             "california", "vermont", "virginia",
             "washington", "west virginia", "wisconsin",
             "wyoming", "district of columbia")
  st[match(tolower(abbr), ab)]
}

industry_plot_data["Full Name"] <- abbr2state(industry_plot_data$State)

head(industry_plot_data, 10)

## # A tibble: 10 x 5
##    State `Total GDP` `Highest GDP` `Lowest GDP` `Full Name`         
##    <chr>       <dbl>         <dbl>        <dbl> <chr>               
##  1 AK      220476432      27571441        23255 alaska              
##  2 AL      772157204      42641286       118371 alabama             
##  3 AR      458771584      29087938        73313 arkansas            
##  4 AZ     1125178114     215381372       496754 arizona             
##  5 CA     9360952426     691948578        48646 california          
##  6 CO     1186164551      75738881        18005 colorado            
##  7 CT      999163128      93157588      3919490 connecticut         
##  8 DC      473056501     125434630    112736453 district of columbia
##  9 DE      260738432      49375769      5877245 delaware            
## 10 FL     3304723391     141734334       135069 florida

# graph a map of the industry GDP
state <- map_data("state")
industry_map <- merge(state, industry_plot_data, by.x = "region", by.y = "Full Name")
industry_map <- arrange(industry_map, group, order)

ggplot(industry_map, aes(x = long, y = lat, group = group, fill = `Total GDP`)) +
  geom_polygon(colour = "black")

Conclusion on the County GDP Dataset

In conclusion, it shows the total GDP in all industries in US and it appears that California have the highest GDP among all states.

Data Cleaning on the Interest Rate Dataset

# rename a column
colnames(interest_file) <- c("Date", "Federal Reserve System", "European Central Bank", "Swiss National Bank", "Bank of England", "Reserve Bank of Australia", "Bank of Japan", "Bank of Brazil")

head(interest_file)

##         Date Federal Reserve System European Central Bank Swiss National Bank
## 1 1970-01-01                     NA                    NA                  NA
## 2 1970-02-01                     NA                    NA                  NA
## 3 1970-03-01                     NA                    NA                  NA
## 4 1970-04-01                     NA                    NA                  NA
## 5 1970-05-01                     NA                    NA                  NA
## 6 1970-06-01                     NA                    NA                  NA
##   Bank of England Reserve Bank of Australia Bank of Japan Bank of Brazil
## 1             8.0                        NA            NA             NA
## 2             8.0                        NA            NA             NA
## 3             8.0                        NA            NA             NA
## 4             7.5                        NA            NA             NA
## 5             7.0                        NA            NA             NA
## 6             7.0                        NA            NA             NA

Data Analysis on the Interest Rate Dataset

# filter the interest rate starting from the 2000
twentyfirst_century_df <- interest_file[interest_file$Date >= "2000-01-01" & interest_file$Date <= "2022-12-31", ]
twentyfirst_century_df$Date <- as.Date(twentyfirst_century_df$Date)

# pivot longer the central banks and interest rate
interest_rate_in_21stdata <- twentyfirst_century_df %>%
  pivot_longer(c( "Federal Reserve System", "European Central Bank", "Swiss National Bank", "Bank of England", "Reserve Bank of Australia", "Bank of Japan", "Bank of Brazil"), names_to = "Central Bank Name", values_to = "Interest Rate")

head(interest_rate_in_21stdata)

## # A tibble: 6 x 3
##   Date       `Central Bank Name`       `Interest Rate`
##   <date>     <chr>                               <dbl>
## 1 2000-01-01 Federal Reserve System               NA  
## 2 2000-01-01 European Central Bank                 3  
## 3 2000-01-01 Swiss National Bank                  NA  
## 4 2000-01-01 Bank of England                       5.5
## 5 2000-01-01 Reserve Bank of Australia             5  
## 6 2000-01-01 Bank of Japan                        NA

ggplot(interest_rate_in_21stdata, aes(x = `Date`, y = `Interest Rate`, color = `Central Bank Name`)) +
  geom_point() +
  geom_hline(yintercept = 2, color = "red")

## Warning: Removed 5047 rows containing missing values (`geom_point()`).

Conclusion on the Interest Rate Dataset

In conclusion, the interest rate in 21st century shows the economic situation of every central bank’s home country. For economy to be in good health, the interest rate usually need to be in in the range of 0 to 2%. Given that, Bank of Brazil have high interest rate to combat their economic uncertainty. Other central banks such as Bank of England and European Central Bank have been dropping their interest rates which show economic improvement.

607_project_2

Eddie Xu

2023-03-04