Project 2 Dataset1

url <- "https://raw.githubusercontent.com/geeman1209/MSDATA2020/master/DATA607/Project%202/Dataset1/state_marriage_rates_90_95_99-16.xlsx"

library(httr)
#Solution to read xlsx files from github was found here: https://community.rstudio.com/t/read-xlsx-from-github/9386/4

#This solution was used for dataset 2 as well.

temp_file <- tempfile(fileext = ".xlsx")
req <- GET(url, 
          # authenticate using GITHUB_PAT
           authenticate(Sys.getenv("GITHUB_PAT"), ""),
          # write result to disk
           write_disk(path = temp_file))
tab <- readxl::read_excel(temp_file)


#Get a view of the data
str(tab)
## Classes 'tbl_df', 'tbl' and 'data.frame':    51 obs. of  21 variables:
##  $ State: chr  "Alabama" "Alaska" "Arizona" "Arkansas" ...
##  $ 2016 : num  7.15 7.1 5.93 9.86 6.46 ...
##  $ 2015 : num  7.35 7.41 5.92 10.04 6.18 ...
##  $ 2014 : num  7.81 7.51 5.78 10.11 6.44 ...
##  $ 2013 : num  7.82 7.29 5.4 9.75 6.46 ...
##  $ 2012 : num  8.2 7.2 5.6 10.9 6 6.8 5.2 5.8 8.4 7.2 ...
##  $ 2011 : num  8.4 7.8 5.7 10.4 5.8 7 5.5 5.2 8.7 7.4 ...
##  $ 2010 : num  8.2 8 5.9 10.8 5.8 6.9 5.6 5.2 7.6 7.3 ...
##  $ 2009 : num  8.3 7.8 5.6 10.7 5.8 6.9 5.9 5.4 4.7 7.5 ...
##  $ 2008 : num  8.6 8.4 6 10.6 6.7 7.4 5.4 5.5 4.1 8 ...
##  $ 2007 : num  8.9 8.5 6.4 12 6.2 7.1 5.5 5.7 4.2 8.5 ...
##  $ 2006 : num  9.2 8.2 6.5 12.4 6.3 7.2 5.5 5.9 4 8.6 ...
##  $ 2005 : num  9.2 8.2 6.6 12.9 6.4 7.6 5.8 5.9 4.1 8.9 ...
##  $ 2004 : num  9.4 8.5 6.7 13.4 6.4 7.4 5.8 6.1 5.2 9 ...
##  $ 2003 : num  9.6 8.1 6.5 13.4 6.1 7.8 5.5 6 5.1 9 ...
##  $ 2002 : num  9.9 8.3 6.7 14.3 6.2 8 5.7 6.4 5.1 9.4 ...
##  $ 2001 : num  9.4 8.1 7.6 14.3 6.5 8.2 5.4 6.5 6.2 9.3 ...
##  $ 2000 : num  10.1 8.9 7.5 15.4 5.8 8.3 5.7 6.5 4.9 8.9 ...
##  $ 1999 : num  10.8 8.6 8.2 14.8 6.4 8.2 5.8 6.7 6.6 8.7 ...
##  $ 1995 : num  9.8 9 8.8 14.4 6.3 9 6.6 7.3 6.1 9.9 ...
##  $ 1990 : num  10.6 10.2 10 15.3 7.9 9.8 7.9 8.4 8.2 10.9 ...
names(tab)
##  [1] "State" "2016"  "2015"  "2014"  "2013"  "2012"  "2011"  "2010"  "2009" 
## [10] "2008"  "2007"  "2006"  "2005"  "2004"  "2003"  "2002"  "2001"  "2000" 
## [19] "1999"  "1995"  "1990"
summary(tab)
##     State                2016             2015             2014       
##  Length:51          Min.   : 5.357   Min.   : 5.292   Min.   : 5.369  
##  Class :character   1st Qu.: 6.128   1st Qu.: 6.172   1st Qu.: 6.329  
##  Mode  :character   Median : 6.852   Median : 6.857   Median : 6.893  
##                     Mean   : 7.418   Mean   : 7.456   Mean   : 7.700  
##                     3rd Qu.: 7.402   3rd Qu.: 7.413   3rd Qu.: 7.595  
##                     Max.   :28.392   Max.   :31.018   Max.   :31.851  
##                                                       NA's   :1       
##       2013             2012             2011             2010      
##  Min.   : 5.021   Min.   : 4.900   Min.   : 4.800   Min.   : 4.90  
##  1st Qu.: 6.305   1st Qu.: 5.800   1st Qu.: 5.850   1st Qu.: 5.80  
##  Median : 6.770   Median : 6.800   Median : 6.800   Median : 6.80  
##  Mean   : 7.537   Mean   : 7.527   Mean   : 7.616   Mean   : 7.62  
##  3rd Qu.: 7.367   3rd Qu.: 7.350   3rd Qu.: 7.650   3rd Qu.: 7.40  
##  Max.   :32.280   Max.   :35.100   Max.   :36.900   Max.   :38.30  
##  NA's   :1                                                         
##       2009             2008             2007             2006       
##  Min.   : 4.700   Min.   : 4.000   Min.   : 4.200   Min.   : 4.000  
##  1st Qu.: 5.800   1st Qu.: 5.950   1st Qu.: 6.150   1st Qu.: 6.350  
##  Median : 6.600   Median : 6.800   Median : 7.000   Median : 7.100  
##  Mean   : 7.535   Mean   : 7.784   Mean   : 8.182   Mean   : 8.432  
##  3rd Qu.: 7.400   3rd Qu.: 7.800   3rd Qu.: 7.800   3rd Qu.: 7.950  
##  Max.   :40.300   Max.   :42.300   Max.   :48.600   Max.   :52.100  
##                                                     NA's   :1       
##       2005             2004             2003             2002       
##  Min.   : 4.100   Min.   : 5.200   Min.   : 5.100   Min.   : 5.100  
##  1st Qu.: 6.500   1st Qu.: 6.500   1st Qu.: 6.500   1st Qu.: 6.525  
##  Median : 7.000   Median : 7.400   Median : 7.200   Median : 7.600  
##  Mean   : 8.684   Mean   : 8.984   Mean   : 9.032   Mean   : 9.294  
##  3rd Qu.: 8.250   3rd Qu.: 8.450   3rd Qu.: 8.400   3rd Qu.: 8.750  
##  Max.   :57.400   Max.   :62.100   Max.   :63.900   Max.   :67.400  
##                                    NA's   :1        NA's   :1       
##       2001             2000             1999            1995      
##  Min.   : 5.400   Min.   : 4.900   Min.   : 5.80   Min.   : 6.10  
##  1st Qu.: 6.775   1st Qu.: 6.900   1st Qu.: 7.05   1st Qu.: 7.30  
##  Median : 7.600   Median : 7.850   Median : 7.90   Median : 8.40  
##  Mean   : 9.468   Mean   : 9.764   Mean   : 9.92   Mean   :10.34  
##  3rd Qu.: 8.975   3rd Qu.: 9.400   3rd Qu.: 9.10   3rd Qu.: 9.90  
##  Max.   :69.600   Max.   :72.200   Max.   :82.30   Max.   :85.20  
##  NA's   :1        NA's   :1                                       
##       1990      
##  Min.   : 7.10  
##  1st Qu.: 8.20  
##  Median : 9.50  
##  Mean   :11.59  
##  3rd Qu.:10.65  
##  Max.   :99.00  
## 
head(tab)
## # A tibble: 6 x 21
##   State `2016` `2015` `2014` `2013` `2012` `2011` `2010` `2009` `2008` `2007`
##   <chr>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
## 1 Alab~   7.15   7.35   7.81   7.82    8.2    8.4    8.2    8.3    8.6    8.9
## 2 Alas~   7.10   7.41   7.51   7.29    7.2    7.8    8      7.8    8.4    8.5
## 3 Ariz~   5.93   5.92   5.78   5.40    5.6    5.7    5.9    5.6    6      6.4
## 4 Arka~   9.86  10.0   10.1    9.75   10.9   10.4   10.8   10.7   10.6   12  
## 5 Cali~   6.46   6.18   6.44   6.46    6      5.8    5.8    5.8    6.7    6.2
## 6 Colo~   7.43   6.79   7.06   6.45    6.8    7      6.9    6.9    7.4    7.1
## # ... with 10 more variables: `2006` <dbl>, `2005` <dbl>, `2004` <dbl>,
## #   `2003` <dbl>, `2002` <dbl>, `2001` <dbl>, `2000` <dbl>, `1999` <dbl>,
## #   `1995` <dbl>, `1990` <dbl>
tail(tab)
## # A tibble: 6 x 21
##   State `2016` `2015` `2014` `2013` `2012` `2011` `2010` `2009` `2008` `2007`
##   <chr>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
## 1 Verm~   8.30   8.12   8.70   9.16    8.2    8.3    9.3    8.7    7.9    8.5
## 2 Virg~   6.99   6.98   6.74   6.66    6.8    6.8    6.8    6.9    7.2    7.5
## 3 Wash~   6.24   6.22   6.96   7.11    6.3    6.1    6      6      6.3    6.4
## 4 West~   6.35   6.57   6.67   6.64    7      7.2    6.7    6.7    7.1    7.3
## 5 Wisc~   5.62   5.61   5.69   5.22    5.4    5.3    5.3    5.3    5.6    5.7
## 6 Wyom~   7.08   7.34   7.66   7.55    7.6    7.8    7.6    8      8.6    9  
## # ... with 10 more variables: `2006` <dbl>, `2005` <dbl>, `2004` <dbl>,
## #   `2003` <dbl>, `2002` <dbl>, `2001` <dbl>, `2000` <dbl>, `1999` <dbl>,
## #   `1995` <dbl>, `1990` <dbl>

The downloaded excel file did not contain census regions such as “Northeast”, “South”, “West”, “MidWest”. To provide a more indepth analysis, it is beneficial to associate states with their appropriate geographic region.

NE.name <- c("Connecticut","Maine","Massachusetts","New Hampshire",
             "Rhode Island","Vermont","New Jersey","New York",
             "Pennsylvania")
NE.abrv <- c("CT","ME","MA","NH","RI","VT","NJ","NY","PA")
NE.ref <- c(NE.name,NE.abrv)

MW.name <- c("Indiana","Illinois","Michigan","Ohio","Wisconsin",
             "Iowa","Kansas","Minnesota","Missouri","Nebraska",
             "North Dakota","South Dakota")
MW.abrv <- c("IN","IL","MI","OH","WI","IA","KS","MN","MO","NE",
             "ND","SD")
MW.ref <- c(MW.name,MW.abrv)

S.name <- c("Delaware","District of Columbia","Florida","Georgia",
            "Maryland","North Carolina","South Carolina","Virginia",
            "West Virginia","Alabama","Kentucky","Mississippi",
            "Tennessee","Arkansas","Louisiana","Oklahoma","Texas")
S.abrv <- c("DE","DC","FL","GA","MD","NC","SC","VA","WV","AL",
            "KY","MS","TN","AR","LA","OK","TX")
S.ref <- c(S.name,S.abrv)

W.name <- c("Arizona","Colorado","Idaho","New Mexico","Montana",
            "Utah","Nevada","Wyoming","Alaska","California",
            "Hawaii","Oregon","Washington")
W.abrv <- c("AZ","CO","ID","NM","MT","UT","NV","WY","AK","CA",
            "HI","OR","WA")
W.ref <- c(W.name,W.abrv)

region.list <- list(
  Northeast=NE.ref,
  Midwest=MW.ref,
  South=S.ref,
  West=W.ref)

tab$regions <- sapply(tab$State, 
                 function(x) names(region.list)[grep(x,region.list)])


newDf <- tab %>% select(State, regions, everything())

All values that should be num type are assigned correctly and states are character types, which makes further analysis possible.

Now we can also tidy up the data some more and convert the data set from wide to long.

States2 <- gather(newDf, "Years", "Rates", 3:22, na.rm = TRUE, -c(State))

States3 <- States2 %>% group_by(Years) %>% summarize(Avg_Rates = mean(Rates))
mrates3 <- States2 %>% group_by(Years) %>% summarize(Total_Rates = sum(Rates))
mrates3
## # A tibble: 20 x 2
##    Years Total_Rates
##    <chr>       <dbl>
##  1 1990         591 
##  2 1995         528.
##  3 1999         506.
##  4 2000         488.
##  5 2001         473.
##  6 2002         465.
##  7 2003         452.
##  8 2004         458.
##  9 2005         443.
## 10 2006         422.
## 11 2007         417.
## 12 2008         397 
## 13 2009         384.
## 14 2010         389.
## 15 2011         388.
## 16 2012         384.
## 17 2013         377.
## 18 2014         385.
## 19 2015         380.
## 20 2016         378.
States4 <- States2 %>% group_by(State) %>%summarize(Avg_Rates = mean(Rates))

States5 <- States2 %>% group_by(Years, regions) %>% summarize(Avg_Rates = mean(Rates))
States5
## # A tibble: 80 x 3
## # Groups:   Years [20]
##    Years regions   Avg_Rates
##    <chr> <chr>         <dbl>
##  1 1990  Midwest        8.8 
##  2 1990  Northeast      8.59
##  3 1990  South         10.8 
##  4 1990  West          17.3 
##  5 1995  Midwest        7.8 
##  6 1995  Northeast      7.67
##  7 1995  South          9.66
##  8 1995  West          15.4 
##  9 1999  Midwest        7.46
## 10 1999  Northeast      7.26
## # ... with 70 more rows
mrates<- States2 %>% group_by(regions) %>% summarize(Avg_Rates = mean(Rates))

Graphs

ggplot(mrates, aes(x=regions, y=Avg_Rates, colour=regions)) + geom_bar(stat = "identity")

library(knitr)
kable(States4)
State Avg_Rates
Alabama 8.936196
Alaska 8.155690
Arizona 6.666728
Arkansas 12.288216
California 6.307525
Colorado 7.566576
Connecticut 5.704987
Delaware 6.121035
District of Columbia 6.609112
Florida 8.458567
Georgia 7.103020
Hawaii 18.819632
Idaho 10.058690
Illinois 6.290073
Indiana 7.448972
Iowa 7.007969
Kansas 6.926694
Kentucky 8.678614
Louisiana 7.597224
Maine 8.030347
Maryland 6.779049
Massachusetts 5.974016
Michigan 6.184573
Minnesota 6.089528
Mississippi 6.362680
Missouri 7.141814
Montana 7.552989
Nebraska 6.972688
Nevada 53.812066
New Hampshire 7.605202
New Jersey 5.684738
New Mexico 7.093693
New York 7.079389
North Carolina 7.251488
North Dakota 6.663458
Ohio 6.581109
Oklahoma 7.353525
Oregon 7.200772
Pennsylvania 5.776922
Rhode Island 6.887332
South Carolina 8.724869
South Dakota 8.202020
Tennessee 10.973675
Texas 7.966002
Utah 9.319386
Vermont 9.128872
Virginia 7.953558
Washington 6.721071
West Virginia 7.172106
Wisconsin 6.016896
Wyoming 8.806495
ggplot(States5, aes(x=Years, y=Avg_Rates, colour = regions)) + geom_point()

ggplot(mrates3, aes(x=Years, y=Total_Rates)) + geom_point()

Conclusion

Viewing the data, there is a undeniable trend in regards to marriage rates. The rate of people getting married is trending downward, a topic that is being reported on with more frequency (here is one interesting article: https://www.theatlantic.com/magazine/archive/2020/03/the-nuclear-family-was-a-mistake/605536/)

Breaking the data down by regions, the rates are skewed. The West shows a high rate, but viewing the data by States, Nevada is skewing the data. Many people go to Vegas to get married but that doesnt mean they’re from Nevada or the West. However, the data still trends downward. Less people are getting married. What is worth further investigating which is mentioned in the article above, is the marriage rates across income levels and regions. Are wealthier folks in the northeast getting married at higher rates than those in the midwest? Are marriage rates for higher income groups trending up or down?