This analysis uses the data behind the FiveThirtyEight story “Marriage Isn’t Dead — Yet”.
The values show the proportion of the relevant population that is divorced.
I explore whether divorce rates are related to the level of income.
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(cowplot) #To access plot_grid which combines multiple plots in a grid
library(ggthemes)
##
## Attaching package: 'ggthemes'
## The following object is masked from 'package:cowplot':
##
## theme_map
This reads the file from GitHub into a data frame and checks the head to get an idea of the data
# Read the divorce CSV straight from GitHub into a base data.frame (not a
# tibble). read.csv() applies CSV-appropriate defaults (only `"` is a quote
# character, `#` is not treated as a comment, short rows are filled), whereas
# read.table() with just sep = "," keeps its whitespace-file defaults.
url <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/marriage/divorce.csv"
data_538 <- read.csv(url)
# Preview the first six rows to get a feel for the columns.
head(data_538)
## X year date all_3544 HS_3544 SC_3544 BAp_3544 BAo_3544
## 1 1 1960 1960-01-01 0.03444454 0.03488887 0.03366938 0.02751277 0.02751277
## 2 2 1970 1970-01-01 0.04925600 0.04999984 0.04870549 0.04125945 0.04125945
## 3 3 1980 1980-01-01 0.10600503 0.10415128 0.11269947 0.09777208 0.09777208
## 4 4 1990 1990-01-01 0.15080495 0.15938095 0.16967005 0.11495533 0.11853432
## 5 5 2000 2000-01-01 0.15684099 0.17544616 0.17380324 0.10561772 0.11053969
## 6 6 2001 2001-01-01 0.15730639 0.17411604 0.17816906 0.10703886 0.11186064
## GD_3544 poor_3544 mid_3544 rich_3544 all_4554 HS_4554 SC_4554
## 1 NA 0.07195136 0.02122530 0.02256233 0.03965056 0.04007142 0.03870331
## 2 NA 0.11373582 0.03030104 0.02192707 0.05062562 0.05056576 0.05147270
## 3 NA 0.24164705 0.07367154 0.03501521 0.08818511 0.08566351 0.09500947
## 4 0.10914537 0.32694839 0.11190969 0.04972527 0.14572572 0.14077914 0.16633103
## 5 0.09590347 0.34308859 0.11658371 0.04939497 0.17958060 0.18477676 0.20110350
## 6 0.09719692 0.34283554 0.12278063 0.04057431 0.18265130 0.19023658 0.20573707
## BAp_4554 BAo_4554 GD_4554 poor_4554 mid_4554 rich_4554
## 1 0.03188568 0.03188568 NA 0.07537252 0.02933657 0.02350489
## 2 0.04831053 0.04831053 NA 0.11249624 0.03386947 0.02123280
## 3 0.09103488 0.09103488 NA 0.19867321 0.06300866 0.02791567
## 4 0.13436974 0.13566093 0.1327413 0.30661378 0.11498064 0.04577501
## 5 0.14904161 0.15369313 0.1425498 0.38213542 0.14224209 0.05152510
## 6 0.14863163 0.15326921 0.1417453 0.39003850 0.14651347 0.04522236
Explore the data set to see the name of each column and determine whether any need to be changed
# List every column name in the raw data to decide which ones need renaming.
names(data_538)
## [1] "X" "year" "date" "all_3544" "HS_3544" "SC_3544"
## [7] "BAp_3544" "BAo_3544" "GD_3544" "poor_3544" "mid_3544" "rich_3544"
## [13] "all_4554" "HS_4554" "SC_4554" "BAp_4554" "BAo_4554" "GD_4554"
## [19] "poor_4554" "mid_4554" "rich_4554"
Use dplyr's select() function to subset the data, keeping only the year, date, and the columns for ages 35 to 44, and check the head to see the new subset
# Focus on individuals aged 35-44: keep the contiguous run of columns from
# `year` through `rich_3544`, which drops the row-index column `X` and every
# *_4554 column.
data_age35to44 <- select(data_538, year:rich_3544)
# Preview the first rows of the subset.
head(data_age35to44)
## year date all_3544 HS_3544 SC_3544 BAp_3544 BAo_3544
## 1 1960 1960-01-01 0.03444454 0.03488887 0.03366938 0.02751277 0.02751277
## 2 1970 1970-01-01 0.04925600 0.04999984 0.04870549 0.04125945 0.04125945
## 3 1980 1980-01-01 0.10600503 0.10415128 0.11269947 0.09777208 0.09777208
## 4 1990 1990-01-01 0.15080495 0.15938095 0.16967005 0.11495533 0.11853432
## 5 2000 2000-01-01 0.15684099 0.17544616 0.17380324 0.10561772 0.11053969
## 6 2001 2001-01-01 0.15730639 0.17411604 0.17816906 0.10703886 0.11186064
## GD_3544 poor_3544 mid_3544 rich_3544
## 1 NA 0.07195136 0.02122530 0.02256233
## 2 NA 0.11373582 0.03030104 0.02192707
## 3 NA 0.24164705 0.07367154 0.03501521
## 4 0.10914537 0.32694839 0.11190969 0.04972527
## 5 0.09590347 0.34308859 0.11658371 0.04939497
## 6 0.09719692 0.34283554 0.12278063 0.04057431
Check the column names for the new data subset
# Confirm which columns survived the subset before renaming them.
names(data_age35to44)
## [1] "year" "date" "all_3544" "HS_3544" "SC_3544" "BAp_3544"
## [7] "BAo_3544" "GD_3544" "poor_3544" "mid_3544" "rich_3544"
Rename the columns of the subset to more intuitive names
#Rename the columns to more intuitive names
# Each new name is paired explicitly with the column it replaces, so a change
# in upstream column order cannot silently mislabel a series; rename() errors
# if any of the old columns is missing. `year` and `date` keep their names.
data_age35to44 <- data_age35to44 %>%
  rename(
    all = all_3544,
    high_school = HS_3544,
    some_college = SC_3544,
    bachelor_or_more = BAp_3544,
    bachelor_no_grad = BAo_3544,
    graduate_degree = GD_3544,
    poor = poor_3544,
    middle_class = mid_3544,
    rich = rich_3544
  )
# Keep the vector of new names available, as the original script defined it.
new_colname <- colnames(data_age35to44)
colnames(data_age35to44)
## [1] "year" "date" "all" "high_school"
## [5] "some_college" "bachelor_or_more" "bachelor_no_grad" "graduate_degree"
## [9] "poor" "middle_class" "rich"
Check the structure of the data subset. The data subset has 17 rows and 11 columns (variables)
# Compact overview of the subset: dimensions plus each column's type and
# first few values.
data_age35to44 %>% str()
## 'data.frame': 17 obs. of 11 variables:
## $ year : int 1960 1970 1980 1990 2000 2001 2002 2003 2004 2005 ...
## $ date : chr "1960-01-01" "1970-01-01" "1980-01-01" "1990-01-01" ...
## $ all : num 0.0344 0.0493 0.106 0.1508 0.1568 ...
## $ high_school : num 0.0349 0.05 0.1042 0.1594 0.1754 ...
## $ some_college : num 0.0337 0.0487 0.1127 0.1697 0.1738 ...
## $ bachelor_or_more: num 0.0275 0.0413 0.0978 0.115 0.1056 ...
## $ bachelor_no_grad: num 0.0275 0.0413 0.0978 0.1185 0.1105 ...
## $ graduate_degree : num NA NA NA 0.1091 0.0959 ...
## $ poor : num 0.072 0.114 0.242 0.327 0.343 ...
## $ middle_class : num 0.0212 0.0303 0.0737 0.1119 0.1166 ...
## $ rich : num 0.0226 0.0219 0.035 0.0497 0.0494 ...
Check the summary of the data to see the descriptive statistics of the data
# Per-column descriptive statistics (min, quartiles, mean, max, NA counts).
data_age35to44 %>% summary()
## year date all high_school
## Min. :1960 Length:17 Min. :0.03444 Min. :0.03489
## 1st Qu.:2000 Class :character 1st Qu.:0.15292 1st Qu.:0.17254
## Median :2004 Mode :character Median :0.15684 Median :0.17545
## Mean :1999 Mean :0.14190 Mean :0.16015
## 3rd Qu.:2008 3rd Qu.:0.16116 3rd Qu.:0.18838
## Max. :2012 Max. :0.16630 Max. :0.19240
##
## some_college bachelor_or_more bachelor_no_grad graduate_degree
## Min. :0.03367 Min. :0.02751 Min. :0.02751 Min. :0.08439
## 1st Qu.:0.17380 1st Qu.:0.10034 1st Qu.:0.10711 1st Qu.:0.08683
## Median :0.17817 Median :0.10306 Median :0.11086 Median :0.08981
## Mean :0.16332 Mean :0.09588 Mean :0.10182 Mean :0.09121
## 3rd Qu.:0.18945 3rd Qu.:0.10562 3rd Qu.:0.11186 3rd Qu.:0.09371
## Max. :0.20318 Max. :0.11496 Max. :0.11853 Max. :0.10915
## NA's :3
## poor middle_class rich
## Min. :0.07195 Min. :0.02123 Min. :0.02193
## 1st Qu.:0.32872 1st Qu.:0.11658 1st Qu.:0.03707
## Median :0.33156 Median :0.12221 Median :0.04939
## Mean :0.30022 Mean :0.10969 Mean :0.04591
## 3rd Qu.:0.33523 3rd Qu.:0.12472 3rd Qu.:0.05789
## Max. :0.34309 Max. :0.13836 Max. :0.06034
##
Divorce rate based on income
#plot divorce rates
# Shared base plot: year on the x-axis and a common y-axis label.
g <- ggplot(data = data_age35to44, aes(year)) + labs(y = "Divorce rate")
# Build one panel on top of `g`: a smoothed trend of the column `rate`
# (without a confidence band), titled `panel_title`.
income_panel <- function(rate, panel_title) {
  g +
    geom_smooth(aes(y = {{ rate }}), se = FALSE) +
    labs(title = panel_title) +
    theme_bw()
}
pl_poor <- income_panel(poor, "Divorce rate: Poor")
pl_middle <- income_panel(middle_class, "Divorce rate: Middle Class")
pl_rich <- income_panel(rich, "Divorce rate: Rich")
# Arrange the three income-group panels in a single grid.
plot_grid(pl_poor, pl_middle, pl_rich)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Divorce rate based on education
# Shared base plot: year on the x-axis and a common y-axis label.
g <- ggplot(data = data_age35to44, aes(year)) + labs(y = "Divorce rate")
# Build one panel on top of `g`: a smoothed trend of the column `rate`
# (without a confidence band), titled `panel_title`.
education_panel <- function(rate, panel_title) {
  g +
    geom_smooth(aes(y = {{ rate }}), se = FALSE) +
    labs(title = panel_title) +
    theme_bw()
}
pl_high_school <- education_panel(high_school, "High School")
pl_some_college <- education_panel(some_college, "Some College")
pl_bachelor_or_more <- education_panel(bachelor_or_more, "Bachelor or More")
pl_bachelor_no_grad <- education_panel(bachelor_no_grad, "Bachelor no Grad")
pl_graduate_degree <- education_panel(graduate_degree, "Graduate degree")
# Arrange the five education-level panels in a single grid.
plot_grid(
  pl_high_school,
  pl_some_college,
  pl_bachelor_or_more,
  pl_bachelor_no_grad,
  pl_graduate_degree
)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
Considering the mean values of divorce rates, it appears that the divorce rates of individuals in the available data set increase as income level decreases. This means that the data tend to show that poor people are more likely to be divorced than the more privileged.