Choose any three of the “wide” datasets identified in the Week 6 Discussion items.
Dataset Used: Population Estimate (2010 - 2014)
Suggested analysis:
library(tidyr)
library(dplyr)
library(stringr)
library(knitr)
library(DT)
rawdata = read.csv("https://raw.githubusercontent.com/L-Velasco/Fall16_IS607/master/NST-EST2014-01-region-CSV.csv", stringsAsFactors = FALSE)
datatable(rawdata)
Used gather() to make the different years be part of observation and reformat the population numbers from character (having “,”) to numeric for later calculations.
tidydata <- gather(rawdata,"Year","Estimate",3:8)
## Warning in combine_vars(vars, ind_list): '.Random.seed' is not an integer
## vector but of type 'NULL', so ignored
tidydata$Estimate <- as.numeric(str_replace_all(tidydata$Estimate,"[,]",""))
datatable(tidydata)
The main transformation process below creates new datasets which aim to prepare and calculate the year by year population change in each geographic area.
The FromYear and ToYear datasets hold all the information to compare one year to another (e.g. compare Estimate base column vs 2010, 2010 vs 2011, 2011 vs 2012, and so on.)
The YearlyGrowth and AveGrowth datasets hold the summarized information of the change in population.
First few observations from each newly formed datasets are also shown below
FromYear <- tidydata %>%
filter(Year != "X2014") %>%
arrange(Geographic.Area, Year) %>%
rename(Area1=Geographic.Area,
Region1=USRegion,
Year1 = Year,
Estimate1 = Estimate)
head(FromYear)
## Area1 Region1 Year1 Estimate1
## 1 .Alabama South Estimates.Base 4780127
## 2 .Alabama South X2010 4785822
## 3 .Alabama South X2011 4801695
## 4 .Alabama South X2012 4817484
## 5 .Alabama South X2013 4833996
## 6 .Alaska West Estimates.Base 710249
ToYear <- tidydata %>%
filter(Year != "Estimates.Base") %>%
arrange(Geographic.Area, Year)
head(ToYear)
## Geographic.Area USRegion Year Estimate
## 1 .Alabama South X2010 4785822
## 2 .Alabama South X2011 4801695
## 3 .Alabama South X2012 4817484
## 4 .Alabama South X2013 4833996
## 5 .Alabama South X2014 4849377
## 6 .Alaska West X2010 713856
YearlyGrowth <- bind_cols(FromYear,ToYear) %>%
mutate(pct_growth = round((Estimate - Estimate1) / Estimate1 * 100,2)) %>%
select(Geographic.Area, USRegion, Year, pct_growth)
AveGrowth <- YearlyGrowth %>%
group_by(Geographic.Area) %>%
summarise("Ave % Growth" = mean(pct_growth)) %>%
arrange(Geographic.Area) %>%
rename("Area"=Geographic.Area)
head(AveGrowth)
## # A tibble: 6 × 2
## Area `Ave % Growth`
## <chr> <dbl>
## 1 .Alabama 0.288
## 2 .Alaska 0.738
## 3 .Arizona 1.040
## 4 .Arkansas 0.344
## 5 .California 0.820
## 6 .Colorado 1.268
YearlyGrowth$Year <- str_replace_all(YearlyGrowth$Year,"X","")
YearlyGrowth <- YearlyGrowth %>%
spread(Year,pct_growth) %>%
arrange(Geographic.Area)
head(YearlyGrowth)
## Geographic.Area USRegion 2010 2011 2012 2013 2014
## 1 .Alabama South 0.12 0.33 0.33 0.34 0.32
## 2 .Alaska West 0.51 1.22 1.18 0.85 -0.07
## 3 .Arizona West 0.31 0.95 1.29 1.20 1.45
## 4 .Arkansas South 0.22 0.55 0.37 0.32 0.26
## 5 .California West 0.22 0.98 0.96 0.97 0.97
## 6 .Colorado West 0.38 1.41 1.41 1.55 1.59
#1. Year over year change in population by state (showing percentage of change)
state_growth <- bind_cols(YearlyGrowth,AveGrowth) %>%
filter(str_detect(Geographic.Area,"[.]")) %>%
select(-USRegion, -Area) %>%
rename("State"=Geographic.Area)
datatable(state_growth)
#2. Year over year change in population by region of the US (showing percentage of change)
region_growth <- bind_cols(YearlyGrowth,AveGrowth) %>%
filter(Geographic.Area %in% c("Northeast", "Midwest", "South","West")) %>%
select(-USRegion, -Area) %>%
rename("US Region"=Geographic.Area)
kable(region_growth)
| US Region | 2010 | 2011 | 2012 | 2013 | 2014 | Ave % Growth |
|---|---|---|---|---|---|---|
| Midwest | 0.06 | 0.26 | 0.27 | 0.35 | 0.26 | 0.240 |
| Northeast | 0.11 | 0.46 | 0.35 | 0.35 | 0.22 | 0.298 |
| South | 0.27 | 1.06 | 1.08 | 1.00 | 1.05 | 0.892 |
| West | 0.24 | 1.00 | 1.04 | 1.05 | 1.09 | 0.884 |
#3a. Overall US population growth (showing percentage of change)
US_growth <- bind_cols(YearlyGrowth,AveGrowth) %>%
filter(Geographic.Area == "United States") %>%
select(-USRegion, -Area) %>%
rename("Country"=Geographic.Area)
kable(US_growth)
| Country | 2010 | 2011 | 2012 | 2013 | 2014 | Ave % Growth |
|---|---|---|---|---|---|---|
| United States | 0.19 | 0.77 | 0.77 | 0.76 | 0.75 | 0.648 |
#3b. Overall Puerto Rico population growth (showing percentage of change)
PR_growth <- bind_cols(YearlyGrowth,AveGrowth) %>%
filter(Geographic.Area == "Puerto Rico") %>%
select(-USRegion, -Area) %>%
rename("Country"=Geographic.Area)
kable(PR_growth)
| Country | 2010 | 2011 | 2012 | 2013 | 2014 | Ave % Growth |
|---|---|---|---|---|---|---|
| Puerto Rico | -0.12 | -0.93 | -1.21 | -1.28 | -1.32 | -0.972 |
A small side note is that based on estimate, there seems to be steady population decline for Puerto Rico