Data 607 Project 2

library(tidyr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths

First, load the data into RStudio from GitHub. I chose the marriage rates data set. The problem noted for this data set is that “[the] data is lost in having it as a column name, and so column headers are values that describe demographics such as education level, age, and whether or not they have children – they are not variables.” What I will attempt to do is rename many of the headers and make the year the column header.

# Read the both-sexes marriage CSV directly from the FiveThirtyEight GitHub
# repository (requires network access).
marriagedata <- read.csv(
  file = "https://raw.githubusercontent.com/fivethirtyeight/data/master/marriage/both_sexes.csv"
)
# as_tibble() replaces tbl_df(), which is deprecated in current dplyr releases.
# The tibble gives compact printing and is what glimpse() is applied to below.
marriage <- as_tibble(marriagedata)
# List every column with its type and first few values.
glimpse(marriage)
## Observations: 17
## Variables: 75
## $ X                <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...
## $ year             <int> 1960, 1970, 1980, 1990, 2000, 2001, 2002, 200...
## $ date             <fctr> 1960-01-01, 1970-01-01, 1980-01-01, 1990-01-...
## $ all_2534         <dbl> 0.1233145, 0.1269715, 0.1991767, 0.2968306, 0...
## $ HS_2534          <dbl> 0.1095332, 0.1094000, 0.1617313, 0.2777491, 0...
## $ SC_2534          <dbl> 0.1522818, 0.1495096, 0.2236916, 0.2780912, 0...
## $ BAp_2534         <dbl> 0.2389952, 0.2187031, 0.2881646, 0.3612968, 0...
## $ BAo_2534         <dbl> 0.2389952, 0.2187031, 0.2881646, 0.3656655, 0...
## $ GD_2534          <dbl> NA, NA, NA, 0.3474505, 0.3691740, 0.3590304, ...
## $ White_2534       <dbl> 0.1164848, 0.1179043, 0.1824126, 0.2639256, 0...
## $ Black_2534       <dbl> 0.1621855, 0.1855163, 0.3137500, 0.4838556, 0...
## $ Hisp_2534        <dbl> 0.1393736, 0.1298769, 0.1885440, 0.2962372, 0...
## $ NE_2534          <dbl> 0.1504184, 0.1517231, 0.2414327, 0.3500384, 0...
## $ MA_2534          <dbl> 0.1628934, 0.1640680, 0.2505925, 0.3623321, 0...
## $ Midwest_2534     <dbl> 0.1121467, 0.1153741, 0.1828339, 0.2755046, 0...
## $ South_2534       <dbl> 0.1090562, 0.1126220, 0.1688435, 0.2639794, 0...
## $ Mountain_2534    <dbl> 0.09152117, 0.10293602, 0.17434230, 0.2526432...
## $ Pacific_2534     <dbl> 0.1198758, 0.1374964, 0.2334279, 0.3319579, 0...
## $ poor_2534        <dbl> 0.1371597, 0.1717202, 0.3100591, 0.4199108, 0...
## $ mid_2534         <dbl> 0.07514929, 0.08159207, 0.14825303, 0.2432000...
## $ rich_2534        <dbl> 0.2066776, 0.1724093, 0.1851082, 0.2783226, 0...
## $ all_3544         <dbl> 0.07058157, 0.06732520, 0.06883378, 0.1119180...
## $ HS_3544          <dbl> 0.06860309, 0.06511964, 0.06429102, 0.1121004...
## $ SC_3544          <dbl> 0.06663695, 0.06271724, 0.06531333, 0.0969937...
## $ BAp_3544         <dbl> 0.1326265, 0.1116899, 0.1056102, 0.1285172, 0...
## $ BAo_3544         <dbl> 0.1326265, 0.1116899, 0.1056102, 0.1258567, 0...
## $ GD_3544          <dbl> NA, NA, NA, 0.1328018, 0.1550970, 0.1595169, ...
## $ White_3544       <dbl> 0.06825586, 0.06250372, 0.05966739, 0.0961131...
## $ Black_3544       <dbl> 0.08836728, 0.10290904, 0.13140081, 0.2201029...
## $ Hisp_3544        <dbl> 0.07307651, 0.07070500, 0.08110790, 0.1219420...
## $ NE_3544          <dbl> 0.09194322, 0.08570110, 0.07997323, 0.1278591...
## $ MA_3544          <dbl> 0.09347468, 0.09040725, 0.09744428, 0.1435498...
## $ Midwest_3544     <dbl> 0.06863360, 0.06156272, 0.06070641, 0.1015757...
## $ South_3544       <dbl> 0.06026353, 0.05966057, 0.05914089, 0.0963703...
## $ Mountain_3544    <dbl> 0.04739747, 0.04651163, 0.04880077, 0.0918990...
## $ Pacific_3544     <dbl> 0.05822486, 0.06347796, 0.07552538, 0.1313463...
## $ poor_3544        <dbl> 0.1019749, 0.1117548, 0.1291426, 0.2012208, 0...
## $ mid_3544         <dbl> 0.04717272, 0.04566838, 0.05050321, 0.0902473...
## $ rich_3544        <dbl> 0.08553870, 0.06499159, 0.04445951, 0.0657391...
## $ all_4554         <dbl> 0.07254649, 0.05968794, 0.05250871, 0.0594782...
## $ HS_4554          <dbl> 0.06840792, 0.05833439, 0.05036563, 0.0598824...
## $ SC_4554          <dbl> 0.07903755, 0.05443478, 0.04816180, 0.0465408...
## $ BAp_4554         <dbl> 0.15360889, 0.10466047, 0.08623774, 0.0730188...
## $ BAo_4554         <dbl> 0.15360889, 0.10466047, 0.08623774, 0.0641652...
## $ GD_4554          <dbl> NA, NA, NA, 0.08394886, 0.09362802, 0.0936287...
## $ White_4554       <dbl> 0.07246692, 0.05754799, 0.04765354, 0.0509255...
## $ Black_4554       <dbl> 0.06913249, 0.07899168, 0.08624602, 0.1161769...
## $ Hisp_4554        <dbl> 0.06636058, 0.05810740, 0.06522951, 0.0761355...
## $ NE_4554          <dbl> 0.10236412, 0.08028082, 0.06930253, 0.0704750...
## $ MA_4554          <dbl> 0.09264788, 0.07860635, 0.07508466, 0.0837313...
## $ Midwest_4554     <dbl> 0.07285321, 0.05791163, 0.04807290, 0.0539839...
## $ South_4554       <dbl> 0.05977295, 0.05174462, 0.04485348, 0.0504363...
## $ Mountain_4554    <dbl> 0.04754183, 0.03970134, 0.03374438, 0.0445941...
## $ Pacific_4554     <dbl> 0.05996993, 0.04826312, 0.04958992, 0.0646187...
## $ poor_4554        <dbl> 0.1030055, 0.1016489, 0.1003011, 0.1148335, 0...
## $ mid_4554         <dbl> 0.05364421, 0.04221637, 0.03830266, 0.0456233...
## $ rich_4554        <dbl> 0.07908591, 0.05142867, 0.03311296, 0.0313638...
## $ nokids_all_2534  <dbl> 0.4640564, 0.4309043, 0.4464304, 0.5425242, 0...
## $ kids_all_2534    <dbl> 0.002820625, 0.009868596, 0.025285667, 0.0602...
## $ nokids_HS_2534   <dbl> 0.4430148, 0.4246779, 0.4319342, 0.5464881, 0...
## $ nokids_SC_2534   <dbl> 0.5000402, 0.4333479, 0.4505900, 0.5238446, 0...
## $ nokids_BAp_2534  <dbl> 0.5619099, 0.4554766, 0.4719700, 0.5560765, 0...
## $ nokids_BAo_2534  <dbl> 0.5619099, 0.4554766, 0.4719700, 0.5633301, 0...
## $ nokids_GD_2534   <dbl> NA, NA, NA, 0.5332628, 0.5367160, 0.5258800, ...
## $ kids_HS_2534     <dbl> 0.003318886, 0.012465915, 0.031930752, 0.0784...
## $ kids_SC_2534     <dbl> 0.001150824, 0.003699982, 0.018135401, 0.0520...
## $ kids_BAp_2534    <dbl> 0.0005751073, 0.0014683425, 0.0062544364, 0.0...
## $ kids_BAo_2534    <dbl> 0.0005751073, 0.0014683425, 0.0062544364, 0.0...
## $ kids_GD_2534     <dbl> NA, NA, NA, 0.01374234, 0.02761467, 0.0264504...
## $ nokids_poor_2534 <dbl> 0.4933061, 0.5097742, 0.5740402, 0.6546908, 0...
## $ nokids_mid_2534  <dbl> 0.4100080, 0.3764538, 0.3998250, 0.5186604, 0...
## $ nokids_rich_2534 <dbl> 0.4921184, 0.4288948, 0.3848089, 0.4750156, 0...
## $ kids_poor_2534   <dbl> 0.008722711, 0.029974945, 0.077926214, 0.1707...
## $ kids_mid_2534    <dbl> 0.0007532065, 0.0033771145, 0.0102368871, 0.0...
## $ kids_rich_2534   <dbl> 0.0008027331, 0.0030435661, 0.0068317224, 0.0...

Replace the NA values with 0. Since there is a lot of data here, I would like to focus on education level and age.

# Zero-fill missing values.
# NOTE(review): the GD_* columns are NA for the earliest years (see the
# glimpse() output), so after this step those years read as 0 rather than
# "not available" in every table and plot built from `marriage`.
marriage[is.na(marriage)] <- 0

# Keep the three identifier columns plus the all / HS / BAp / GD series for
# each of the three age bands.
marriage <- subset(
  marriage,
  select = c(
    "year", "date", "X",
    "all_2534", "HS_2534", "BAp_2534", "GD_2534",
    "all_3544", "HS_3544", "BAp_3544", "GD_3544",
    "all_4554", "HS_4554", "BAp_4554", "GD_4554"
  )
)

# Readable labels for columns 4-15, listed in the same order as the selection
# above. The source suffixes 2534 / 3544 / 4554 are the age bands 25-34,
# 35-44 and 45-54, so the labels use those ranges.
colnames(marriage)[4:15] <- c(
  "All 25-34", "HighSchool 25-34", "BachDegree 25-34", "GradDegree 25-34",
  "All 35-44", "HighSchool 35-44", "BachDegree 35-44", "GradDegree 35-44",
  "All 45-54", "HighSchool 45-54", "BachDegree 45-54", "GradDegree 45-54"
)

I’d like to spread the data by year, to get a better look at how the different variables have changed over time.

# Reshape to long form: the three identifier columns are kept as they are,
# and every other column becomes one row per year, with its column name in
# `variable` and its number in `value`.
md1 <- melt(marriage, id.vars = c("year", "date", "X"))

# Keep only the columns used downstream. No later step aggregates by year, so
# the table is not grouped; as_tibble() keeps the compact, truncated printing.
md2 <- md1 %>%
  select(year, variable, value) %>%
  as_tibble()
md2
## # A tibble: 204 x 3
## # Groups:   year [17]
##     year  variable     value
##    <int>    <fctr>     <dbl>
##  1  1960 All 24-34 0.1233145
##  2  1970 All 24-34 0.1269715
##  3  1980 All 24-34 0.1991767
##  4  1990 All 24-34 0.2968306
##  5  2000 All 24-34 0.3450087
##  6  2001 All 24-34 0.3527767
##  7  2002 All 24-34 0.3535249
##  8  2003 All 24-34 0.3620345
##  9  2004 All 24-34 0.3673247
## 10  2005 All 24-34 0.3793451
## # ... with 194 more rows
# Pivot to one row per series with one column per year, then display only the
# decade snapshots alongside the series label.
select(
  spread(md2, key = year, value = value),
  variable, `1960`, `1970`, `1980`, `1990`, `2000`, `2010`
)
## # A tibble: 12 x 7
##            variable     `1960`     `1970`     `1980`     `1990`     `2000`
##  *           <fctr>      <dbl>      <dbl>      <dbl>      <dbl>      <dbl>
##  1        All 24-34 0.12331447 0.12697147 0.19917674 0.29683059 0.34500872
##  2 HighSchool 24-34 0.10953316 0.10940002 0.16173134 0.27774914 0.33165454
##  3 BachDegree 24-34 0.23899523 0.21870310 0.28816462 0.36129679 0.38749056
##  4 GradDegree 24-34 0.00000000 0.00000000 0.00000000 0.34745052 0.36917399
##  5        All 34-44 0.07058157 0.06732520 0.06883378 0.11191800 0.15605881
##  6 HighSchool 34-44 0.06860309 0.06511964 0.06429102 0.11210043 0.16993703
##  7 BachDegree 34-44 0.13262647 0.11168988 0.10561021 0.12851723 0.15412382
##  8 GradDegree 34-44 0.00000000 0.00000000 0.00000000 0.13280181 0.15509702
##  9        All 44-54 0.07254649 0.05968794 0.05250871 0.05947824 0.08804394
## 10 HighSchool 44-54 0.06840792 0.05833439 0.05036563 0.05988244 0.09442809
## 11 BachDegree 44-54 0.15360889 0.10466047 0.08623774 0.07301884 0.09208417
## 12 GradDegree 44-54 0.00000000 0.00000000 0.00000000 0.08394886 0.09362802
## # ... with 1 more variables: `2010` <dbl>

Now, I’d like to plot the value of each variable against the year.

# Scatter plot of every series' value by year, one colour per series.
# The FiveThirtyEight README for this data set describes the figures as the
# share of each group that has never been married, so the y axis is labelled
# accordingly rather than as a marriage rate.
ggplot(md2, aes(x = year, y = value)) +
  geom_point(aes(color = variable)) +
  xlab("Years") +
  ylab("Share Never Married")

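Basic analysis: Note that the values in this data set are the share of each group that has never been married, so a higher value means that fewer people in that group have married. With this graph, we can see that in the 1960s, this share was highest among 25-34 year olds with a bachelor’s degree. As the years progressed, the share among 25-34 year olds with just a high school diploma became the highest.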