url <- "https://raw.githubusercontent.com/geeman1209/MSDATA2020/master/DATA607/Project%202/Dataset1/state_marriage_rates_90_95_99-16.xlsx"
library(httr)
#Solution to read xlsx files from github was found here: https://community.rstudio.com/t/read-xlsx-from-github/9386/4
#This solution was used for dataset 2 as well.
temp_file <- tempfile(fileext = ".xlsx")
req <- GET(url,
# authenticate using GITHUB_PAT
authenticate(Sys.getenv("GITHUB_PAT"), ""),
# write result to disk
write_disk(path = temp_file))
tab <- readxl::read_excel(temp_file)
#Get a view of the data
str(tab)
## Classes 'tbl_df', 'tbl' and 'data.frame': 51 obs. of 21 variables:
## $ State: chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ 2016 : num 7.15 7.1 5.93 9.86 6.46 ...
## $ 2015 : num 7.35 7.41 5.92 10.04 6.18 ...
## $ 2014 : num 7.81 7.51 5.78 10.11 6.44 ...
## $ 2013 : num 7.82 7.29 5.4 9.75 6.46 ...
## $ 2012 : num 8.2 7.2 5.6 10.9 6 6.8 5.2 5.8 8.4 7.2 ...
## $ 2011 : num 8.4 7.8 5.7 10.4 5.8 7 5.5 5.2 8.7 7.4 ...
## $ 2010 : num 8.2 8 5.9 10.8 5.8 6.9 5.6 5.2 7.6 7.3 ...
## $ 2009 : num 8.3 7.8 5.6 10.7 5.8 6.9 5.9 5.4 4.7 7.5 ...
## $ 2008 : num 8.6 8.4 6 10.6 6.7 7.4 5.4 5.5 4.1 8 ...
## $ 2007 : num 8.9 8.5 6.4 12 6.2 7.1 5.5 5.7 4.2 8.5 ...
## $ 2006 : num 9.2 8.2 6.5 12.4 6.3 7.2 5.5 5.9 4 8.6 ...
## $ 2005 : num 9.2 8.2 6.6 12.9 6.4 7.6 5.8 5.9 4.1 8.9 ...
## $ 2004 : num 9.4 8.5 6.7 13.4 6.4 7.4 5.8 6.1 5.2 9 ...
## $ 2003 : num 9.6 8.1 6.5 13.4 6.1 7.8 5.5 6 5.1 9 ...
## $ 2002 : num 9.9 8.3 6.7 14.3 6.2 8 5.7 6.4 5.1 9.4 ...
## $ 2001 : num 9.4 8.1 7.6 14.3 6.5 8.2 5.4 6.5 6.2 9.3 ...
## $ 2000 : num 10.1 8.9 7.5 15.4 5.8 8.3 5.7 6.5 4.9 8.9 ...
## $ 1999 : num 10.8 8.6 8.2 14.8 6.4 8.2 5.8 6.7 6.6 8.7 ...
## $ 1995 : num 9.8 9 8.8 14.4 6.3 9 6.6 7.3 6.1 9.9 ...
## $ 1990 : num 10.6 10.2 10 15.3 7.9 9.8 7.9 8.4 8.2 10.9 ...
names(tab)
## [1] "State" "2016" "2015" "2014" "2013" "2012" "2011" "2010" "2009"
## [10] "2008" "2007" "2006" "2005" "2004" "2003" "2002" "2001" "2000"
## [19] "1999" "1995" "1990"
summary(tab)
## State 2016 2015 2014
## Length:51 Min. : 5.357 Min. : 5.292 Min. : 5.369
## Class :character 1st Qu.: 6.128 1st Qu.: 6.172 1st Qu.: 6.329
## Mode :character Median : 6.852 Median : 6.857 Median : 6.893
## Mean : 7.418 Mean : 7.456 Mean : 7.700
## 3rd Qu.: 7.402 3rd Qu.: 7.413 3rd Qu.: 7.595
## Max. :28.392 Max. :31.018 Max. :31.851
## NA's :1
## 2013 2012 2011 2010
## Min. : 5.021 Min. : 4.900 Min. : 4.800 Min. : 4.90
## 1st Qu.: 6.305 1st Qu.: 5.800 1st Qu.: 5.850 1st Qu.: 5.80
## Median : 6.770 Median : 6.800 Median : 6.800 Median : 6.80
## Mean : 7.537 Mean : 7.527 Mean : 7.616 Mean : 7.62
## 3rd Qu.: 7.367 3rd Qu.: 7.350 3rd Qu.: 7.650 3rd Qu.: 7.40
## Max. :32.280 Max. :35.100 Max. :36.900 Max. :38.30
## NA's :1
## 2009 2008 2007 2006
## Min. : 4.700 Min. : 4.000 Min. : 4.200 Min. : 4.000
## 1st Qu.: 5.800 1st Qu.: 5.950 1st Qu.: 6.150 1st Qu.: 6.350
## Median : 6.600 Median : 6.800 Median : 7.000 Median : 7.100
## Mean : 7.535 Mean : 7.784 Mean : 8.182 Mean : 8.432
## 3rd Qu.: 7.400 3rd Qu.: 7.800 3rd Qu.: 7.800 3rd Qu.: 7.950
## Max. :40.300 Max. :42.300 Max. :48.600 Max. :52.100
## NA's :1
## 2005 2004 2003 2002
## Min. : 4.100 Min. : 5.200 Min. : 5.100 Min. : 5.100
## 1st Qu.: 6.500 1st Qu.: 6.500 1st Qu.: 6.500 1st Qu.: 6.525
## Median : 7.000 Median : 7.400 Median : 7.200 Median : 7.600
## Mean : 8.684 Mean : 8.984 Mean : 9.032 Mean : 9.294
## 3rd Qu.: 8.250 3rd Qu.: 8.450 3rd Qu.: 8.400 3rd Qu.: 8.750
## Max. :57.400 Max. :62.100 Max. :63.900 Max. :67.400
## NA's :1 NA's :1
## 2001 2000 1999 1995
## Min. : 5.400 Min. : 4.900 Min. : 5.80 Min. : 6.10
## 1st Qu.: 6.775 1st Qu.: 6.900 1st Qu.: 7.05 1st Qu.: 7.30
## Median : 7.600 Median : 7.850 Median : 7.90 Median : 8.40
## Mean : 9.468 Mean : 9.764 Mean : 9.92 Mean :10.34
## 3rd Qu.: 8.975 3rd Qu.: 9.400 3rd Qu.: 9.10 3rd Qu.: 9.90
## Max. :69.600 Max. :72.200 Max. :82.30 Max. :85.20
## NA's :1 NA's :1
## 1990
## Min. : 7.10
## 1st Qu.: 8.20
## Median : 9.50
## Mean :11.59
## 3rd Qu.:10.65
## Max. :99.00
##
head(tab)
## # A tibble: 6 x 21
## State `2016` `2015` `2014` `2013` `2012` `2011` `2010` `2009` `2008` `2007`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Alab~ 7.15 7.35 7.81 7.82 8.2 8.4 8.2 8.3 8.6 8.9
## 2 Alas~ 7.10 7.41 7.51 7.29 7.2 7.8 8 7.8 8.4 8.5
## 3 Ariz~ 5.93 5.92 5.78 5.40 5.6 5.7 5.9 5.6 6 6.4
## 4 Arka~ 9.86 10.0 10.1 9.75 10.9 10.4 10.8 10.7 10.6 12
## 5 Cali~ 6.46 6.18 6.44 6.46 6 5.8 5.8 5.8 6.7 6.2
## 6 Colo~ 7.43 6.79 7.06 6.45 6.8 7 6.9 6.9 7.4 7.1
## # ... with 10 more variables: `2006` <dbl>, `2005` <dbl>, `2004` <dbl>,
## # `2003` <dbl>, `2002` <dbl>, `2001` <dbl>, `2000` <dbl>, `1999` <dbl>,
## # `1995` <dbl>, `1990` <dbl>
tail(tab)
## # A tibble: 6 x 21
## State `2016` `2015` `2014` `2013` `2012` `2011` `2010` `2009` `2008` `2007`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Verm~ 8.30 8.12 8.70 9.16 8.2 8.3 9.3 8.7 7.9 8.5
## 2 Virg~ 6.99 6.98 6.74 6.66 6.8 6.8 6.8 6.9 7.2 7.5
## 3 Wash~ 6.24 6.22 6.96 7.11 6.3 6.1 6 6 6.3 6.4
## 4 West~ 6.35 6.57 6.67 6.64 7 7.2 6.7 6.7 7.1 7.3
## 5 Wisc~ 5.62 5.61 5.69 5.22 5.4 5.3 5.3 5.3 5.6 5.7
## 6 Wyom~ 7.08 7.34 7.66 7.55 7.6 7.8 7.6 8 8.6 9
## # ... with 10 more variables: `2006` <dbl>, `2005` <dbl>, `2004` <dbl>,
## # `2003` <dbl>, `2002` <dbl>, `2001` <dbl>, `2000` <dbl>, `1999` <dbl>,
## # `1995` <dbl>, `1990` <dbl>
The downloaded excel file did not contain census regions such as “Northeast”, “South”, “West”, “MidWest”. To provide a more indepth analysis, it is beneficial to associate states with their appropriate geographic region.
NE.name <- c("Connecticut","Maine","Massachusetts","New Hampshire",
"Rhode Island","Vermont","New Jersey","New York",
"Pennsylvania")
NE.abrv <- c("CT","ME","MA","NH","RI","VT","NJ","NY","PA")
NE.ref <- c(NE.name,NE.abrv)
MW.name <- c("Indiana","Illinois","Michigan","Ohio","Wisconsin",
"Iowa","Kansas","Minnesota","Missouri","Nebraska",
"North Dakota","South Dakota")
MW.abrv <- c("IN","IL","MI","OH","WI","IA","KS","MN","MO","NE",
"ND","SD")
MW.ref <- c(MW.name,MW.abrv)
S.name <- c("Delaware","District of Columbia","Florida","Georgia",
"Maryland","North Carolina","South Carolina","Virginia",
"West Virginia","Alabama","Kentucky","Mississippi",
"Tennessee","Arkansas","Louisiana","Oklahoma","Texas")
S.abrv <- c("DE","DC","FL","GA","MD","NC","SC","VA","WV","AL",
"KY","MS","TN","AR","LA","OK","TX")
S.ref <- c(S.name,S.abrv)
W.name <- c("Arizona","Colorado","Idaho","New Mexico","Montana",
"Utah","Nevada","Wyoming","Alaska","California",
"Hawaii","Oregon","Washington")
W.abrv <- c("AZ","CO","ID","NM","MT","UT","NV","WY","AK","CA",
"HI","OR","WA")
W.ref <- c(W.name,W.abrv)
region.list <- list(
Northeast=NE.ref,
Midwest=MW.ref,
South=S.ref,
West=W.ref)
tab$regions <- sapply(tab$State,
function(x) names(region.list)[grep(x,region.list)])
newDf <- tab %>% select(State, regions, everything())
All values that should be num type are assigned correctly and states are character types, which makes further analysis possible.
Now we can also tidy up the data some more and convert the data set from wide to long.
States2 <- gather(newDf, "Years", "Rates", 3:22, na.rm = TRUE, -c(State))
States3 <- States2 %>% group_by(Years) %>% summarize(Avg_Rates = mean(Rates))
mrates3 <- States2 %>% group_by(Years) %>% summarize(Total_Rates = sum(Rates))
mrates3
## # A tibble: 20 x 2
## Years Total_Rates
## <chr> <dbl>
## 1 1990 591
## 2 1995 528.
## 3 1999 506.
## 4 2000 488.
## 5 2001 473.
## 6 2002 465.
## 7 2003 452.
## 8 2004 458.
## 9 2005 443.
## 10 2006 422.
## 11 2007 417.
## 12 2008 397
## 13 2009 384.
## 14 2010 389.
## 15 2011 388.
## 16 2012 384.
## 17 2013 377.
## 18 2014 385.
## 19 2015 380.
## 20 2016 378.
States4 <- States2 %>% group_by(State) %>%summarize(Avg_Rates = mean(Rates))
States5 <- States2 %>% group_by(Years, regions) %>% summarize(Avg_Rates = mean(Rates))
States5
## # A tibble: 80 x 3
## # Groups: Years [20]
## Years regions Avg_Rates
## <chr> <chr> <dbl>
## 1 1990 Midwest 8.8
## 2 1990 Northeast 8.59
## 3 1990 South 10.8
## 4 1990 West 17.3
## 5 1995 Midwest 7.8
## 6 1995 Northeast 7.67
## 7 1995 South 9.66
## 8 1995 West 15.4
## 9 1999 Midwest 7.46
## 10 1999 Northeast 7.26
## # ... with 70 more rows
mrates<- States2 %>% group_by(regions) %>% summarize(Avg_Rates = mean(Rates))
ggplot(mrates, aes(x=regions, y=Avg_Rates, colour=regions)) + geom_bar(stat = "identity")
library(knitr)
kable(States4)
| State | Avg_Rates |
|---|---|
| Alabama | 8.936196 |
| Alaska | 8.155690 |
| Arizona | 6.666728 |
| Arkansas | 12.288216 |
| California | 6.307525 |
| Colorado | 7.566576 |
| Connecticut | 5.704987 |
| Delaware | 6.121035 |
| District of Columbia | 6.609112 |
| Florida | 8.458567 |
| Georgia | 7.103020 |
| Hawaii | 18.819632 |
| Idaho | 10.058690 |
| Illinois | 6.290073 |
| Indiana | 7.448972 |
| Iowa | 7.007969 |
| Kansas | 6.926694 |
| Kentucky | 8.678614 |
| Louisiana | 7.597224 |
| Maine | 8.030347 |
| Maryland | 6.779049 |
| Massachusetts | 5.974016 |
| Michigan | 6.184573 |
| Minnesota | 6.089528 |
| Mississippi | 6.362680 |
| Missouri | 7.141814 |
| Montana | 7.552989 |
| Nebraska | 6.972688 |
| Nevada | 53.812066 |
| New Hampshire | 7.605202 |
| New Jersey | 5.684738 |
| New Mexico | 7.093693 |
| New York | 7.079389 |
| North Carolina | 7.251488 |
| North Dakota | 6.663458 |
| Ohio | 6.581109 |
| Oklahoma | 7.353525 |
| Oregon | 7.200772 |
| Pennsylvania | 5.776922 |
| Rhode Island | 6.887332 |
| South Carolina | 8.724869 |
| South Dakota | 8.202020 |
| Tennessee | 10.973675 |
| Texas | 7.966002 |
| Utah | 9.319386 |
| Vermont | 9.128872 |
| Virginia | 7.953558 |
| Washington | 6.721071 |
| West Virginia | 7.172106 |
| Wisconsin | 6.016896 |
| Wyoming | 8.806495 |
ggplot(States5, aes(x=Years, y=Avg_Rates, colour = regions)) + geom_point()
ggplot(mrates3, aes(x=Years, y=Total_Rates)) + geom_point()
Viewing the data, there is a undeniable trend in regards to marriage rates. The rate of people getting married is trending downward, a topic that is being reported on with more frequency (here is one interesting article: https://www.theatlantic.com/magazine/archive/2020/03/the-nuclear-family-was-a-mistake/605536/)
Breaking the data down by regions, the rates are skewed. The West shows a high rate, but viewing the data by States, Nevada is skewing the data. Many people go to Vegas to get married but that doesnt mean they’re from Nevada or the West. However, the data still trends downward. Less people are getting married. What is worth further investigating which is mentioned in the article above, is the marriage rates across income levels and regions. Are wealthier folks in the northeast getting married at higher rates than those in the midwest? Are marriage rates for higher income groups trending up or down?