library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
library(readr)
# Read the tab-delimited births file; doubled quote characters are not
# treated as an escaped quote.
CDCBirths <- read_delim(
  file = "~/Downloads/CDCBirths.txt",
  delim = "\t",
  escape_double = FALSE
)
## Warning: 45 parsing failures.
## row col expected actual
## 1430 -- 8 columns 1 columns
## 1431 -- 8 columns 1 columns
## 1432 -- 8 columns 1 columns
## 1433 -- 8 columns 1 columns
## 1434 -- 8 columns 1 columns
## .... ... ......... .........
## .See problems(...) for more details.
# Keep only rows that have a gender code, and only the four columns used below.
Births <- CDCBirths %>%
  filter(!is.na(GenderCode)) %>%
  select(State, Year, GenderCode, Births)
Recall how we combined the boys and girls rows for each year. The package tidyr allows us to do this in a different way.
# Male rows only; the birth count is carried forward under the name Boys.
Boys <- Births %>%
  filter(GenderCode == "M") %>%
  select(State, Year, Boys = Births)
# Female rows only; the key columns get a G prefix so they stay distinct
# from the boys' keys once the two tables are bound side by side.
Girls <- Births %>%
  filter(GenderCode == "F") %>%
  select(GState = State, GYear = Year, Girls = Births)
# Put the boys and girls tables side by side. cbind() pairs rows purely by
# position, so the result is only meaningful if both tables list the same
# State/Year combinations in the same order.
Both <- cbind(Boys, Girls)
# Count the rows whose keys disagree between the two halves; 0 means every
# row lines up. The column name is GYear (capital Y): with a misspelled name
# `$` returns NULL, the comparison collapses to length zero, and sum() reports
# 0 no matter what the data look like.
sum(Both$GYear != Both$Year | Both$GState != Both$State)
## [1] 0
# The keys have been checked, so drop the duplicated girls-side state and
# year columns and add the share of births that were boys.
Both <- Both %>%
  select(-GState, -GYear) %>%
  mutate(PctBoys = Boys / (Boys + Girls))
That was how we did this without using the capabilities of tidyr. Stop and look at the tidyr spread documentation.
# Same result via tidyr: spread() turns the F and M values of GenderCode into
# two columns holding the birth counts, one row per State/Year.
Both2 <- Births %>%
  filter(GenderCode %in% c("F", "M")) %>%
  select(State, Year, GenderCode, Births) %>%
  spread(key = GenderCode, value = Births) %>%
  mutate(PctBoys = M / (M + F))
glimpse(Both2)
## Observations: 459
## Variables: 5
## $ State <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",...
## $ Year <int> 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, ...
## $ F <int> 31942, 31672, 30725, 29227, 28897, 28772, 28535, 29136...
## $ M <int> 32862, 32874, 31750, 30823, 30457, 29676, 29632, 30286...
## $ PctBoys <dbl> 0.5070983, 0.5093112, 0.5082033, 0.5132889, 0.5131415,...
Let’s see if we got the same result in both processes.
# Row-by-row difference between the two PctBoys columns; a summary of all
# zeros means the two approaches agree.
diffPct <- Both2[["PctBoys"]] - Both[["PctBoys"]]
summary(diffPct)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 0 0 0
Now we want to see if ratios are persistent. To do this we need to make our observations states, with the ratio values for each year in separate variables.
# One row per state, one PctBoys column per year. sep = "" prefixes each new
# column with the key name, giving Year2007, Year2008, ...
States <- Both2 %>%
  select(State, Year, PctBoys) %>%
  spread(key = Year, value = PctBoys, sep = "")
glimpse(States)
## Observations: 51
## Variables: 10
## $ State <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "Californ...
## $ Year2007 <dbl> 0.5070983, 0.5138436, 0.5115604, 0.5121804, 0.5120124...
## $ Year2008 <dbl> 0.5093112, 0.5177417, 0.5105489, 0.5114215, 0.5124117...
## $ Year2009 <dbl> 0.5082033, 0.5267573, 0.5111748, 0.5149467, 0.5122178...
## $ Year2010 <dbl> 0.5132889, 0.5104176, 0.5091738, 0.5117800, 0.5119875...
## $ Year2011 <dbl> 0.5131415, 0.5130936, 0.5120700, 0.5110681, 0.5122839...
## $ Year2012 <dbl> 0.5077334, 0.5104139, 0.5121991, 0.5130258, 0.5110401...
## $ Year2013 <dbl> 0.5094297, 0.5116198, 0.5136332, 0.5122647, 0.5124165...
## $ Year2014 <dbl> 0.5096766, 0.5165906, 0.5125968, 0.5127626, 0.5124752...
## $ Year2015 <dbl> 0.5087584, 0.5131182, 0.5126009, 0.5129610, 0.5128297...
As a digression, how would we reverse this? The opposite of spread is gather. Stop and look at the tidyr gather documentation.
# Back to long form: stack the Year2007..Year2015 columns into Year/PctBoys
# pairs, then keep characters 5-8 of the column name (the four-digit year).
Both3 <- States %>%
  gather(key = Year, value = PctBoys, Year2007:Year2015) %>%
  mutate(Year = substr(Year, start = 5, stop = 8))
glimpse(Both3)
## Observations: 459
## Variables: 3
## $ State <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "Californi...
## $ Year <chr> "2007", "2007", "2007", "2007", "2007", "2007", "2007"...
## $ PctBoys <dbl> 0.5070983, 0.5138436, 0.5115604, 0.5121804, 0.5120124,...
Returning from our digression, how do we examine the question of persistence? We can do a correlation analysis of the state ratios for separate years. Does a high ratio in one year tend to be correlated with a high ratio in other years?
I asked Google how to plot a correlation matrix in R. Here’s one of the results. http://www.sthda.com/english/wiki/visualize-correlation-matrix-using-correlogram
I decided to use the package corrplot and try out the various options. Make sure it is listed in your packages tab.
library(corrplot)
First create a correlation matrix of the numerical variables in the States dataframe. We just need to eliminate the name of the state.
# Correlation matrix of the yearly ratio columns (everything except State).
M <- cor(select(States, -State))
M
## Year2007 Year2008 Year2009 Year2010 Year2011
## Year2007 1.000000000 0.29658567 -0.1020505 0.01149055 -0.004605409
## Year2008 0.296585675 1.00000000 0.1368398 0.07721920 0.282043084
## Year2009 -0.102050537 0.13683979 1.0000000 0.22847921 0.192316952
## Year2010 0.011490547 0.07721920 0.2284792 1.00000000 0.019087496
## Year2011 -0.004605409 0.28204308 0.1923170 0.01908750 1.000000000
## Year2012 -0.145539510 -0.12509436 0.4288843 0.09009395 0.081946158
## Year2013 -0.274048690 0.01550372 0.3530217 0.44504922 0.141825544
## Year2014 -0.000111667 0.15718548 0.5350325 0.10499700 0.200271688
## Year2015 0.032806311 0.03745095 0.3693287 0.00621416 0.076204276
## Year2012 Year2013 Year2014 Year2015
## Year2007 -0.14553951 -0.27404869 -0.000111667 0.03280631
## Year2008 -0.12509436 0.01550372 0.157185480 0.03745095
## Year2009 0.42888427 0.35302172 0.535032493 0.36932872
## Year2010 0.09009395 0.44504922 0.104996998 0.00621416
## Year2011 0.08194616 0.14182554 0.200271688 0.07620428
## Year2012 1.00000000 0.30822439 0.493573171 0.70302020
## Year2013 0.30822439 1.00000000 0.401558501 0.22096833
## Year2014 0.49357317 0.40155850 1.000000000 0.39458016
## Year2015 0.70302020 0.22096833 0.394580156 1.00000000
Now we can play with the various methods to display the correlation matrix.
# Same matrix drawn with four different glyph styles.
corrplot(corr = M, method = "circle")
corrplot(corr = M, method = "pie")
corrplot(corr = M, method = "color")
corrplot(corr = M, method = "number")
We can also use different layouts.
# Show only one triangle of the symmetric matrix.
corrplot(corr = M, type = "upper")
corrplot(corr = M, type = "lower")
We can use different orderings of the variables.
# Upper-triangle correlogram with the variables reordered by hierarchical
# clustering.
corrplot(corr = M, type = "upper", order = "hclust")
Try a few different options from the website.