Description of the Project

Choose any three of the “wide” datasets identified in the Week 6 Discussion items.

Dataset Used: Population Estimate (2010 - 2014)

Suggested analysis:

Steps

Attach libraries, load and show raw data

library(tidyr)
library(dplyr)
library(stringr)
library(knitr)
library(DT)
rawdata = read.csv("https://raw.githubusercontent.com/L-Velasco/Fall16_IS607/master/NST-EST2014-01-region-CSV.csv", stringsAsFactors = FALSE)

datatable(rawdata)

Tidy the data

Used gather() to make the different years be part of observation and reformat the population numbers from character (having “,”) to numeric for later calculations.

tidydata <- gather(rawdata,"Year","Estimate",3:8)
## Warning in combine_vars(vars, ind_list): '.Random.seed' is not an integer
## vector but of type 'NULL', so ignored
tidydata$Estimate <- as.numeric(str_replace_all(tidydata$Estimate,"[,]",""))

datatable(tidydata)

Transform the data

The main transformation process below creates new datasets which aim to prepare and calculate the year by year population change in each geographic area.

The FromYear and ToYear datasets hold all the information to compare one year to another (e.g. compare Estimate base column vs 2010, 2010 vs 2011, 2011 vs 2012, and so on.)

The YearlyGrowth and AveGrowth datasets hold the summarized information of the change in population.

First few observations from each newly formed datasets are also shown below

FromYear <- tidydata %>% 
  filter(Year != "X2014") %>% 
  arrange(Geographic.Area, Year) %>% 
  rename(Area1=Geographic.Area,
         Region1=USRegion,
         Year1 = Year,
         Estimate1 = Estimate)
head(FromYear)
##      Area1 Region1          Year1 Estimate1
## 1 .Alabama   South Estimates.Base   4780127
## 2 .Alabama   South          X2010   4785822
## 3 .Alabama   South          X2011   4801695
## 4 .Alabama   South          X2012   4817484
## 5 .Alabama   South          X2013   4833996
## 6  .Alaska    West Estimates.Base    710249
ToYear <- tidydata %>% 
  filter(Year != "Estimates.Base") %>% 
  arrange(Geographic.Area, Year)
head(ToYear)
##   Geographic.Area USRegion  Year Estimate
## 1        .Alabama    South X2010  4785822
## 2        .Alabama    South X2011  4801695
## 3        .Alabama    South X2012  4817484
## 4        .Alabama    South X2013  4833996
## 5        .Alabama    South X2014  4849377
## 6         .Alaska     West X2010   713856
YearlyGrowth <- bind_cols(FromYear,ToYear) %>% 
  mutate(pct_growth = round((Estimate - Estimate1) / Estimate1 * 100,2)) %>%
  select(Geographic.Area, USRegion, Year, pct_growth)

AveGrowth <- YearlyGrowth %>% 
  group_by(Geographic.Area) %>% 
  summarise("Ave % Growth" = mean(pct_growth)) %>% 
  arrange(Geographic.Area) %>% 
  rename("Area"=Geographic.Area)
head(AveGrowth)
## # A tibble: 6 × 2
##          Area `Ave % Growth`
##         <chr>          <dbl>
## 1    .Alabama          0.288
## 2     .Alaska          0.738
## 3    .Arizona          1.040
## 4   .Arkansas          0.344
## 5 .California          0.820
## 6   .Colorado          1.268
YearlyGrowth$Year <- str_replace_all(YearlyGrowth$Year,"X","")

YearlyGrowth <- YearlyGrowth %>% 
  spread(Year,pct_growth) %>% 
  arrange(Geographic.Area)
head(YearlyGrowth)
##   Geographic.Area USRegion 2010 2011 2012 2013  2014
## 1        .Alabama    South 0.12 0.33 0.33 0.34  0.32
## 2         .Alaska     West 0.51 1.22 1.18 0.85 -0.07
## 3        .Arizona     West 0.31 0.95 1.29 1.20  1.45
## 4       .Arkansas    South 0.22 0.55 0.37 0.32  0.26
## 5     .California     West 0.22 0.98 0.96 0.97  0.97
## 6       .Colorado     West 0.38 1.41 1.41 1.55  1.59

Analyze the data


#1. Year over year change in population by state (showing percentage of change)

state_growth <- bind_cols(YearlyGrowth,AveGrowth) %>% 
  filter(str_detect(Geographic.Area,"[.]")) %>% 
  select(-USRegion, -Area)  %>% 
  rename("State"=Geographic.Area)
datatable(state_growth)


#2. Year over year change in population by region of the US (showing percentage of change)

region_growth <- bind_cols(YearlyGrowth,AveGrowth) %>% 
  filter(Geographic.Area %in% c("Northeast", "Midwest", "South","West")) %>% 
  select(-USRegion, -Area)  %>% 
  rename("US Region"=Geographic.Area)
kable(region_growth)
US Region 2010 2011 2012 2013 2014 Ave % Growth
Midwest 0.06 0.26 0.27 0.35 0.26 0.240
Northeast 0.11 0.46 0.35 0.35 0.22 0.298
South 0.27 1.06 1.08 1.00 1.05 0.892
West 0.24 1.00 1.04 1.05 1.09 0.884


#3a. Overall US population growth (showing percentage of change)

US_growth <- bind_cols(YearlyGrowth,AveGrowth) %>% 
  filter(Geographic.Area == "United States") %>% 
  select(-USRegion, -Area)  %>% 
  rename("Country"=Geographic.Area)
kable(US_growth)
Country 2010 2011 2012 2013 2014 Ave % Growth
United States 0.19 0.77 0.77 0.76 0.75 0.648


#3b. Overall Puerto Rico population growth (showing percentage of change)

PR_growth <- bind_cols(YearlyGrowth,AveGrowth) %>% 
  filter(Geographic.Area == "Puerto Rico") %>% 
  select(-USRegion, -Area)  %>% 
  rename("Country"=Geographic.Area)
kable(PR_growth)
Country 2010 2011 2012 2013 2014 Ave % Growth
Puerto Rico -0.12 -0.93 -1.21 -1.28 -1.32 -0.972

A small side note is that based on estimate, there seems to be steady population decline for Puerto Rico