“Obtained from FiveThirtyEight, many of the columns are the drug names and they could become row objects. The table is based on the age of the user and it would be better to read if the ages read across the table as column headers.
The readme file is on the page prior.
https://github.com/fivethirtyeight/data/blob/master/drug-use-by-age/drug-use-by-age.csv"
In order to analyse Drug use by age data set I performed the following steps:
drugs <- read.csv("https://raw.githubusercontent.com/fivethirtyeight/data/master/drug-use-by-age/drug-use-by-age.csv")
head(drugs)
## age n alcohol.use alcohol.frequency marijuana.use marijuana.frequency
## 1 12 2798 3.9 3 1.1 4
## 2 13 2757 8.5 6 3.4 15
## 3 14 2792 18.1 5 8.7 24
## 4 15 2956 29.2 6 14.5 25
## 5 16 3058 40.1 10 22.5 30
## 6 17 3038 49.3 13 28.0 36
## cocaine.use cocaine.frequency crack.use crack.frequency heroin.use
## 1 0.1 5.0 0.0 - 0.1
## 2 0.1 1.0 0.0 3.0 0.0
## 3 0.1 5.5 0.0 - 0.1
## 4 0.5 4.0 0.1 9.5 0.2
## 5 1.0 7.0 0.0 1.0 0.1
## 6 2.0 5.0 0.1 21.0 0.1
## heroin.frequency hallucinogen.use hallucinogen.frequency inhalant.use
## 1 35.5 0.2 52 1.6
## 2 - 0.6 6 2.5
## 3 2.0 1.6 3 2.6
## 4 1.0 2.1 4 2.5
## 5 66.5 3.4 3 3.0
## 6 64.0 4.8 3 2.0
## inhalant.frequency pain.releiver.use pain.releiver.frequency
## 1 19.0 2.0 36
## 2 12.0 2.4 14
## 3 5.0 3.9 12
## 4 5.5 5.5 10
## 5 3.0 6.2 7
## 6 4.0 8.5 9
## oxycontin.use oxycontin.frequency tranquilizer.use
## 1 0.1 24.5 0.2
## 2 0.1 41.0 0.3
## 3 0.4 4.5 0.9
## 4 0.8 3.0 2.0
## 5 1.1 4.0 2.4
## 6 1.4 6.0 3.5
## tranquilizer.frequency stimulant.use stimulant.frequency meth.use
## 1 52.0 0.2 2.0 0.0
## 2 25.5 0.3 4.0 0.1
## 3 5.0 0.8 12.0 0.1
## 4 4.5 1.5 6.0 0.3
## 5 11.0 1.8 9.5 0.3
## 6 7.0 2.8 9.0 0.6
## meth.frequency sedative.use sedative.frequency
## 1 - 0.2 13.0
## 2 5.0 0.1 19.0
## 3 24.0 0.2 16.5
## 4 10.5 0.4 30.0
## 5 36.0 0.2 3.0
## 6 48.0 0.5 6.5
dim(drugs)
## [1] 17 28
colnames(drugs)
## [1] "age" "n"
## [3] "alcohol.use" "alcohol.frequency"
## [5] "marijuana.use" "marijuana.frequency"
## [7] "cocaine.use" "cocaine.frequency"
## [9] "crack.use" "crack.frequency"
## [11] "heroin.use" "heroin.frequency"
## [13] "hallucinogen.use" "hallucinogen.frequency"
## [15] "inhalant.use" "inhalant.frequency"
## [17] "pain.releiver.use" "pain.releiver.frequency"
## [19] "oxycontin.use" "oxycontin.frequency"
## [21] "tranquilizer.use" "tranquilizer.frequency"
## [23] "stimulant.use" "stimulant.frequency"
## [25] "meth.use" "meth.frequency"
## [27] "sedative.use" "sedative.frequency"
drugs <- drugs %>% mutate_at(vars(-age),funs(as.numeric(.)))
head(drugs)
## age n alcohol.use alcohol.frequency marijuana.use marijuana.frequency
## 1 12 2798 3.9 3 1.1 4
## 2 13 2757 8.5 6 3.4 15
## 3 14 2792 18.1 5 8.7 24
## 4 15 2956 29.2 6 14.5 25
## 5 16 3058 40.1 10 22.5 30
## 6 17 3038 49.3 13 28.0 36
## cocaine.use cocaine.frequency crack.use crack.frequency heroin.use
## 1 0.1 6 0.0 1 0.1
## 2 0.1 2 0.0 8 0.0
## 3 0.1 7 0.0 1 0.1
## 4 0.5 5 0.1 13 0.2
## 5 1.0 9 0.0 2 0.1
## 6 2.0 6 0.1 7 0.1
## heroin.frequency hallucinogen.use hallucinogen.frequency inhalant.use
## 1 8 0.2 52 1.6
## 2 1 0.6 6 2.5
## 3 5 1.6 3 2.6
## 4 2 2.1 4 2.5
## 5 16 3.4 3 3.0
## 6 14 4.8 3 2.0
## inhalant.frequency pain.releiver.use pain.releiver.frequency
## 1 5 2.0 36
## 2 3 2.4 14
## 3 10 3.9 12
## 4 11 5.5 10
## 5 7 6.2 7
## 6 9 8.5 9
## oxycontin.use oxycontin.frequency tranquilizer.use
## 1 0.1 6 0.2
## 2 0.1 10 0.3
## 3 0.4 9 0.9
## 4 0.8 7 2.0
## 5 1.1 8 2.4
## 6 1.4 13 3.5
## tranquilizer.frequency stimulant.use stimulant.frequency meth.use
## 1 52.0 0.2 2.0 0.0
## 2 25.5 0.3 4.0 0.1
## 3 5.0 0.8 12.0 0.1
## 4 4.5 1.5 6.0 0.3
## 5 11.0 1.8 9.5 0.3
## 6 7.0 2.8 9.0 0.6
## meth.frequency sedative.use sedative.frequency
## 1 1 0.2 13.0
## 2 13 0.1 19.0
## 3 8 0.2 16.5
## 4 2 0.4 30.0
## 5 10 0.2 3.0
## 6 12 0.5 6.5
drugs_long <- drugs %>% gather(drug, percentage, colnames(drugs)[3]:colnames(drugs)[28])
head(drugs_long)
## age n drug percentage
## 1 12 2798 alcohol.use 3.9
## 2 13 2757 alcohol.use 8.5
## 3 14 2792 alcohol.use 18.1
## 4 15 2956 alcohol.use 29.2
## 5 16 3058 alcohol.use 40.1
## 6 17 3038 alcohol.use 49.3
drugs_long <- drugs_long %>% mutate(drug = str_replace(drug, "pain.releiver", "pain releiver"))
head(drugs_long)
## age n drug percentage
## 1 12 2798 alcohol.use 3.9
## 2 13 2757 alcohol.use 8.5
## 3 14 2792 alcohol.use 18.1
## 4 15 2956 alcohol.use 29.2
## 5 16 3058 alcohol.use 40.1
## 6 17 3038 alcohol.use 49.3
drugs_long <- drugs_long %>% separate(drug, c("drug_name", "use_frequency"), sep = "\\.")
head(drugs_long)
## age n drug_name use_frequency percentage
## 1 12 2798 alcohol use 3.9
## 2 13 2757 alcohol use 8.5
## 3 14 2792 alcohol use 18.1
## 4 15 2956 alcohol use 29.2
## 5 16 3058 alcohol use 40.1
## 6 17 3038 alcohol use 49.3
drugs_wide <- drugs_long %>% spread(use_frequency, percentage)
head(drugs_wide)
## age n drug_name frequency use
## 1 12 2798 alcohol 3 3.9
## 2 12 2798 cocaine 6 0.1
## 3 12 2798 crack 1 0.0
## 4 12 2798 hallucinogen 52 0.2
## 5 12 2798 heroin 8 0.1
## 6 12 2798 inhalant 5 1.6
group_by(drugs_wide, age) %>% summarise(mean_usage = mean(use),median_usage = median(use), sd_usage = sd(use))
## # A tibble: 17 x 4
## age mean_usage median_usage sd_usage
## <fctr> <dbl> <dbl> <dbl>
## 1 12 0.7461538 0.2 1.152255
## 2 13 1.4153846 0.3 2.421379
## 3 14 2.8846154 0.8 5.174754
## 4 15 4.5846154 1.5 8.356619
## 5 16 6.3153846 1.8 11.783169
## 6 17 7.9692308 2.0 14.476555
## 7 18 9.6076923 3.0 17.236418
## 8 19 10.1692308 3.3 18.612200
## 9 20 10.8846154 4.0 19.801551
## 10 21 11.4615385 3.9 23.244947
## 11 22-23 11.1846154 3.6 23.193600
## 12 24-25 10.5076923 2.6 22.796873
## 13 26-29 9.7307692 2.3 22.040356
## 14 30-34 8.5923077 1.4 21.163115
## 15 35-49 7.3769231 0.6 20.515244
## 16 50-64 6.2615385 0.4 18.414421
## 17 65+ 3.9538462 0.0 13.629357
group_by(drugs_wide, age) %>% filter(use == min(use)) %>% arrange(age) %>% select (age,drug_name,use)
## # A tibble: 33 x 3
## # Groups: age [17]
## age drug_name use
## <fctr> <chr> <dbl>
## 1 12 crack 0.0
## 2 12 meth 0.0
## 3 13 crack 0.0
## 4 13 heroin 0.0
## 5 14 crack 0.0
## 6 15 crack 0.1
## 7 16 crack 0.0
## 8 17 crack 0.1
## 9 17 heroin 0.1
## 10 18 crack 0.4
## # ... with 23 more rows
group_by(drugs_wide, age) %>% filter(use == max(use)) %>% arrange(age) %>% select (age,drug_name,use)
## # A tibble: 17 x 3
## # Groups: age [17]
## age drug_name use
## <fctr> <chr> <dbl>
## 1 12 alcohol 3.9
## 2 13 alcohol 8.5
## 3 14 alcohol 18.1
## 4 15 alcohol 29.2
## 5 16 alcohol 40.1
## 6 17 alcohol 49.3
## 7 18 alcohol 58.7
## 8 19 alcohol 64.6
## 9 20 alcohol 69.7
## 10 21 alcohol 83.2
## 11 22-23 alcohol 84.2
## 12 24-25 alcohol 83.1
## 13 26-29 alcohol 80.7
## 14 30-34 alcohol 77.5
## 15 35-49 alcohol 75.0
## 16 50-64 alcohol 67.2
## 17 65+ alcohol 49.3
group_by(drugs_wide, drug_name) %>% filter(use == min(use)) %>% arrange(drug_name) %>% select (drug_name, age, use)
## # A tibble: 20 x 3
## # Groups: drug_name [13]
## drug_name age use
## <chr> <fctr> <dbl>
## 1 alcohol 12 3.9
## 2 cocaine 65+ 0.0
## 3 crack 12 0.0
## 4 crack 13 0.0
## 5 crack 14 0.0
## 6 crack 16 0.0
## 7 crack 65+ 0.0
## 8 hallucinogen 65+ 0.1
## 9 heroin 13 0.0
## 10 heroin 65+ 0.0
## 11 inhalant 65+ 0.0
## 12 marijuana 12 1.1
## 13 meth 12 0.0
## 14 meth 65+ 0.0
## 15 oxycontin 65+ 0.0
## 16 pain releiver 65+ 0.6
## 17 sedative 65+ 0.0
## 18 stimulant 65+ 0.0
## 19 tranquilizer 12 0.2
## 20 tranquilizer 65+ 0.2
group_by(drugs_wide, drug_name) %>% filter(use == max(use)) %>% arrange(drug_name) %>% select (drug_name, age, use)
## # A tibble: 17 x 3
## # Groups: drug_name [13]
## drug_name age use
## <chr> <fctr> <dbl>
## 1 alcohol 22-23 84.2
## 2 cocaine 20 4.9
## 3 crack 20 0.6
## 4 hallucinogen 19 8.6
## 5 heroin 22-23 1.1
## 6 inhalant 16 3.0
## 7 marijuana 20 34.0
## 8 meth 20 0.9
## 9 oxycontin 18 1.7
## 10 oxycontin 20 1.7
## 11 oxycontin 22-23 1.7
## 12 pain releiver 20 10.0
## 13 pain releiver 22-23 10.0
## 14 sedative 17 0.5
## 15 sedative 20 0.5
## 16 stimulant 21 4.1
## 17 tranquilizer 20 5.4
group_by(drugs_wide, drug_name) %>% summarise(sum_usage=sum(use)) %>% arrange(desc(sum_usage))
## # A tibble: 13 x 2
## drug_name sum_usage
## <chr> <dbl>
## 1 alcohol 942.3
## 2 marijuana 321.7
## 3 pain releiver 106.6
## 4 hallucinogen 57.7
## 5 tranquilizer 47.7
## 6 cocaine 37.0
## 7 stimulant 32.6
## 8 inhalant 23.6
## 9 oxycontin 15.9
## 10 meth 6.5
## 11 heroin 6.0
## 12 crack 5.0
## 13 sedative 4.8
“I started practicing yoga about a year ago, and realize I’m late to the game. I found this data set about interest in yoga on data.world. I believe it is pulled from Google trends. It is showing the interest in yoga by month by year by state. I believe it is a”wide" table, which might be more easy to perform data analysis by making “long.”
After tidying the data, I would be interested to know when people were most interested in yoga in the last 10 years. And where in the US has there been the most interest in yoga (I would guess California)? Could this analysis be a proxy to measure the popularity of yoga by state in the US?
yoga <- read.csv("https://query.data.world/s/pFEtecI1n6Xrcdft_g4-GSDe8IBMKI")
head(yoga)
## X
## 1 Values show search interest per month in yoga and have been indexed to 100, where 100 is the maximum value.
## 2 2004-01
## 3 2004-02
## 4 2004-03
## 5 2004-04
## 6 2004-05
## Alabama..us.al. Alaska..us.ak. Arizona..us.az. Arkansas..us.ar.
## 1 NA NA NA NA
## 2 20 23 21 24
## 3 8 26 25 16
## 4 10 26 22 26
## 5 15 34 24 18
## 6 15 14 24 11
## California..us.ca. Colorado..us.co. Connecticut..us.ct. Delaware..us.de.
## 1 NA NA NA NA
## 2 32 33 27 47
## 3 27 30 26 28
## 4 28 29 30 51
## 5 25 27 22 27
## 6 25 24 25 20
## District.of.Columbia..us.dc. Florida..us.fl. Georgia..us.ga.
## 1 NA NA NA
## 2 32 21 21
## 3 36 17 20
## 4 29 17 20
## 5 29 19 15
## 6 24 20 16
## Hawaii..us.hi. Idaho..us.id. Illinois..us.il. Indiana..us.in.
## 1 NA NA NA NA
## 2 36 21 25 24
## 3 24 22 23 14
## 4 36 21 25 17
## 5 30 18 17 14
## 6 39 14 17 9
## Iowa..us.ia. Kansas..us.ks. Kentucky..us.ky. Louisiana..us.la.
## 1 NA NA NA NA
## 2 14 20 17 20
## 3 16 12 19 15
## 4 18 13 18 17
## 5 19 19 12 20
## 6 14 15 15 14
## Maine..us.me. Maryland..us.md. Massachusetts..us.ma. Michigan..us.mi.
## 1 NA NA NA NA
## 2 29 26 41 19
## 3 29 23 33 18
## 4 26 22 32 17
## 5 21 17 31 15
## 6 25 21 27 15
## Minnesota..us.mn. Mississippi..us.ms. Missouri..us.mo. Montana..us.mt.
## 1 NA NA NA NA
## 2 26 16 19 44
## 3 22 20 18 26
## 4 21 18 15 41
## 5 17 18 12 25
## 6 22 13 15 24
## Nebraska..us.ne. Nevada..us.nv. New.Hampshire..us.nh. New.Jersey..us.nj.
## 1 NA NA NA NA
## 2 15 20 45 27
## 3 21 25 20 22
## 4 16 24 22 23
## 5 16 17 26 20
## 6 14 25 23 22
## New.Mexico..us.nm. New.York..us.ny. North.Carolina..us.nc.
## 1 NA NA NA
## 2 33 35 23
## 3 25 28 22
## 4 26 29 20
## 5 18 26 18
## 6 21 28 16
## North.Dakota..us.nd. Ohio..us.oh. Oklahoma..us.ok. Oregon..us.or.
## 1 NA NA NA NA
## 2 52 19 22 34
## 3 45 16 19 30
## 4 45 15 12 28
## 5 45 13 11 22
## 6 45 16 17 25
## Pennsylvania..us.pa. Rhode.Island..us.ri. South.Carolina..us.sc.
## 1 NA NA NA
## 2 19 44 24
## 3 18 26 19
## 4 20 27 18
## 5 19 31 14
## 6 16 26 11
## South.Dakota..us.sd. Tennessee..us.tn. Texas..us.tx. Utah..us.ut.
## 1 NA NA NA NA
## 2 25 21 24 26
## 3 22 18 16 20
## 4 21 16 17 10
## 5 21 21 16 20
## 6 28 15 17 19
## Vermont..us.vt. Virginia..us.va. Washington..us.wa.
## 1 NA NA NA
## 2 42 22 30
## 3 39 16 29
## 4 41 19 27
## 5 37 17 25
## 6 31 18 25
## West.Virginia..us.wv. Wisconsin..us.wi. Wyoming..us.wy.
## 1 NA NA NA
## 2 23 18 0
## 3 17 17 37
## 4 27 20 35
## 5 26 17 37
## 6 21 18 35
yoga <- yoga %>% filter(!is.na(yoga[2]))
head(yoga)
## X Alabama..us.al. Alaska..us.ak. Arizona..us.az. Arkansas..us.ar.
## 1 2004-01 20 23 21 24
## 2 2004-02 8 26 25 16
## 3 2004-03 10 26 22 26
## 4 2004-04 15 34 24 18
## 5 2004-05 15 14 24 11
## 6 2004-06 12 21 23 14
## California..us.ca. Colorado..us.co. Connecticut..us.ct. Delaware..us.de.
## 1 32 33 27 47
## 2 27 30 26 28
## 3 28 29 30 51
## 4 25 27 22 27
## 5 25 24 25 20
## 6 27 27 26 22
## District.of.Columbia..us.dc. Florida..us.fl. Georgia..us.ga.
## 1 32 21 21
## 2 36 17 20
## 3 29 17 20
## 4 29 19 15
## 5 24 20 16
## 6 26 18 18
## Hawaii..us.hi. Idaho..us.id. Illinois..us.il. Indiana..us.in.
## 1 36 21 25 24
## 2 24 22 23 14
## 3 36 21 25 17
## 4 30 18 17 14
## 5 39 14 17 9
## 6 44 17 20 18
## Iowa..us.ia. Kansas..us.ks. Kentucky..us.ky. Louisiana..us.la.
## 1 14 20 17 20
## 2 16 12 19 15
## 3 18 13 18 17
## 4 19 19 12 20
## 5 14 15 15 14
## 6 17 18 17 16
## Maine..us.me. Maryland..us.md. Massachusetts..us.ma. Michigan..us.mi.
## 1 29 26 41 19
## 2 29 23 33 18
## 3 26 22 32 17
## 4 21 17 31 15
## 5 25 21 27 15
## 6 22 23 29 17
## Minnesota..us.mn. Mississippi..us.ms. Missouri..us.mo. Montana..us.mt.
## 1 26 16 19 44
## 2 22 20 18 26
## 3 21 18 15 41
## 4 17 18 12 25
## 5 22 13 15 24
## 6 18 18 20 21
## Nebraska..us.ne. Nevada..us.nv. New.Hampshire..us.nh. New.Jersey..us.nj.
## 1 15 20 45 27
## 2 21 25 20 22
## 3 16 24 22 23
## 4 16 17 26 20
## 5 14 25 23 22
## 6 15 23 27 21
## New.Mexico..us.nm. New.York..us.ny. North.Carolina..us.nc.
## 1 33 35 23
## 2 25 28 22
## 3 26 29 20
## 4 18 26 18
## 5 21 28 16
## 6 32 28 22
## North.Dakota..us.nd. Ohio..us.oh. Oklahoma..us.ok. Oregon..us.or.
## 1 52 19 22 34
## 2 45 16 19 30
## 3 45 15 12 28
## 4 45 13 11 22
## 5 45 16 17 25
## 6 45 16 17 27
## Pennsylvania..us.pa. Rhode.Island..us.ri. South.Carolina..us.sc.
## 1 19 44 24
## 2 18 26 19
## 3 20 27 18
## 4 19 31 14
## 5 16 26 11
## 6 20 33 19
## South.Dakota..us.sd. Tennessee..us.tn. Texas..us.tx. Utah..us.ut.
## 1 25 21 24 26
## 2 22 18 16 20
## 3 21 16 17 10
## 4 21 21 16 20
## 5 28 15 17 19
## 6 28 16 17 16
## Vermont..us.vt. Virginia..us.va. Washington..us.wa.
## 1 42 22 30
## 2 39 16 29
## 3 41 19 27
## 4 37 17 25
## 5 31 18 25
## 6 34 22 27
## West.Virginia..us.wv. Wisconsin..us.wi. Wyoming..us.wy.
## 1 23 18 0
## 2 17 17 37
## 3 27 20 35
## 4 26 17 37
## 5 21 18 35
## 6 28 19 35
dim(yoga)
## [1] 148 52
colnames(yoga)
## [1] "X" "Alabama..us.al."
## [3] "Alaska..us.ak." "Arizona..us.az."
## [5] "Arkansas..us.ar." "California..us.ca."
## [7] "Colorado..us.co." "Connecticut..us.ct."
## [9] "Delaware..us.de." "District.of.Columbia..us.dc."
## [11] "Florida..us.fl." "Georgia..us.ga."
## [13] "Hawaii..us.hi." "Idaho..us.id."
## [15] "Illinois..us.il." "Indiana..us.in."
## [17] "Iowa..us.ia." "Kansas..us.ks."
## [19] "Kentucky..us.ky." "Louisiana..us.la."
## [21] "Maine..us.me." "Maryland..us.md."
## [23] "Massachusetts..us.ma." "Michigan..us.mi."
## [25] "Minnesota..us.mn." "Mississippi..us.ms."
## [27] "Missouri..us.mo." "Montana..us.mt."
## [29] "Nebraska..us.ne." "Nevada..us.nv."
## [31] "New.Hampshire..us.nh." "New.Jersey..us.nj."
## [33] "New.Mexico..us.nm." "New.York..us.ny."
## [35] "North.Carolina..us.nc." "North.Dakota..us.nd."
## [37] "Ohio..us.oh." "Oklahoma..us.ok."
## [39] "Oregon..us.or." "Pennsylvania..us.pa."
## [41] "Rhode.Island..us.ri." "South.Carolina..us.sc."
## [43] "South.Dakota..us.sd." "Tennessee..us.tn."
## [45] "Texas..us.tx." "Utah..us.ut."
## [47] "Vermont..us.vt." "Virginia..us.va."
## [49] "Washington..us.wa." "West.Virginia..us.wv."
## [51] "Wisconsin..us.wi." "Wyoming..us.wy."
yoga_long <- yoga %>% gather(state, number, colnames(yoga)[2]:colnames(yoga)[52])
head(yoga_long)
## X state number
## 1 2004-01 Alabama..us.al. 20
## 2 2004-02 Alabama..us.al. 8
## 3 2004-03 Alabama..us.al. 10
## 4 2004-04 Alabama..us.al. 15
## 5 2004-05 Alabama..us.al. 15
## 6 2004-06 Alabama..us.al. 12
yoga_long <- yoga_long %>% rename(date = X)
head(yoga_long)
## date state number
## 1 2004-01 Alabama..us.al. 20
## 2 2004-02 Alabama..us.al. 8
## 3 2004-03 Alabama..us.al. 10
## 4 2004-04 Alabama..us.al. 15
## 5 2004-05 Alabama..us.al. 15
## 6 2004-06 Alabama..us.al. 12
yoga_long <- yoga_long %>% separate(state, c("state","country_state"), sep = "\\.\\.") %>% select(-country_state)
head(yoga_long)
## date state number
## 1 2004-01 Alabama 20
## 2 2004-02 Alabama 8
## 3 2004-03 Alabama 10
## 4 2004-04 Alabama 15
## 5 2004-05 Alabama 15
## 6 2004-06 Alabama 12
group_by(yoga_long, state) %>% summarise(sum=sum(number)) %>% arrange(desc(sum))
## # A tibble: 51 x 2
## state sum
## <chr> <int>
## 1 Vermont 7529
## 2 Hawaii 5699
## 3 Rhode.Island 5546
## 4 New.Hampshire 5292
## 5 Maine 5251
## 6 Montana 4976
## 7 Wyoming 4815
## 8 Massachusetts 4763
## 9 Alaska 4726
## 10 Colorado 4578
## # ... with 41 more rows
group_by(yoga_long, date) %>% summarise(sum=sum(number)) %>% arrange(desc(sum))
## # A tibble: 148 x 2
## date sum
## <fctr> <int>
## 1 2016-01 1803
## 2 2015-01 1766
## 3 2014-01 1652
## 4 2015-07 1644
## 5 2015-06 1639
## 6 2016-02 1639
## 7 2015-08 1605
## 8 2014-07 1563
## 9 2015-03 1561
## 10 2015-02 1550
## # ... with 138 more rows
" While looking at fivethirtyeight’s data on github, I came across one particularly wide dataset, found below. It details marriage rates for a variety of demographics. However, the demographics are demonstrated by column names. Five example column headers are: HS_2534, HS_3544, HS_4554, nokids_HS_2534, kids_HS_2534.
These column titles mean: High School Graduate aged 25-34, High School Graduate 35-44, High School Graduate 45-54, High School Graduate without kids 25-34, and High School Graduate without kids 25-34. All of this data is lost in having it as a column name, and so column headers are values that describe demographics such as education level, age, and whether or not they have children – they are not variables.
https://github.com/fivethirtyeight/data/blob/master/marriage/both_sexes.csv "
In order to analyse Marriage Rates data set I performed the following steps:
marriage <- read.csv("https://raw.githubusercontent.com/fivethirtyeight/data/master/marriage/both_sexes.csv")
head(marriage)
## X year date all_2534 HS_2534 SC_2534 BAp_2534 BAo_2534
## 1 1 1960 1960-01-01 0.1233145 0.1095332 0.1522818 0.2389952 0.2389952
## 2 2 1970 1970-01-01 0.1269715 0.1094000 0.1495096 0.2187031 0.2187031
## 3 3 1980 1980-01-01 0.1991767 0.1617313 0.2236916 0.2881646 0.2881646
## 4 4 1990 1990-01-01 0.2968306 0.2777491 0.2780912 0.3612968 0.3656655
## 5 5 2000 2000-01-01 0.3450087 0.3316545 0.3249205 0.3874906 0.3939579
## 6 6 2001 2001-01-01 0.3527767 0.3446069 0.3341101 0.3835686 0.3925148
## GD_2534 White_2534 Black_2534 Hisp_2534 NE_2534 MA_2534
## 1 NA 0.1164848 0.1621855 0.1393736 0.1504184 0.1628934
## 2 NA 0.1179043 0.1855163 0.1298769 0.1517231 0.1640680
## 3 NA 0.1824126 0.3137500 0.1885440 0.2414327 0.2505925
## 4 0.3474505 0.2639256 0.4838556 0.2962372 0.3500384 0.3623321
## 5 0.3691740 0.3127149 0.5144994 0.3180681 0.4091852 0.4175565
## 6 0.3590304 0.3183506 0.5437985 0.3321214 0.4200581 0.4294281
## Midwest_2534 South_2534 Mountain_2534 Pacific_2534 poor_2534 mid_2534
## 1 0.1121467 0.1090562 0.09152117 0.1198758 0.1371597 0.07514929
## 2 0.1153741 0.1126220 0.10293602 0.1374964 0.1717202 0.08159207
## 3 0.1828339 0.1688435 0.17434230 0.2334279 0.3100591 0.14825303
## 4 0.2755046 0.2639794 0.25264326 0.3319579 0.4199108 0.24320008
## 5 0.3308022 0.3099712 0.30621032 0.3753061 0.5033676 0.30202036
## 6 0.3344332 0.3182688 0.30980779 0.3844799 0.5178771 0.31716118
## rich_2534 all_3544 HS_3544 SC_3544 BAp_3544 BAo_3544 GD_3544
## 1 0.2066776 0.07058157 0.06860309 0.06663695 0.1326265 0.1326265 NA
## 2 0.1724093 0.06732520 0.06511964 0.06271724 0.1116899 0.1116899 NA
## 3 0.1851082 0.06883378 0.06429102 0.06531333 0.1056102 0.1056102 NA
## 4 0.2783226 0.11191800 0.11210043 0.09699372 0.1285172 0.1258567 0.1328018
## 5 0.2717386 0.15605881 0.16993703 0.13800404 0.1541238 0.1536299 0.1550970
## 6 0.2532041 0.15642529 0.16870156 0.13986044 0.1548151 0.1524923 0.1595169
## White_3544 Black_3544 Hisp_3544 NE_3544 MA_3544 Midwest_3544
## 1 0.06825586 0.08836728 0.07307651 0.09194322 0.09347468 0.06863360
## 2 0.06250372 0.10290904 0.07070500 0.08570110 0.09040725 0.06156272
## 3 0.05966739 0.13140081 0.08110790 0.07997323 0.09744428 0.06070641
## 4 0.09611312 0.22010298 0.12194206 0.12785915 0.14354989 0.10157576
## 5 0.13207032 0.30239381 0.15469520 0.17327422 0.18819256 0.14539201
## 6 0.13287455 0.30857796 0.14953050 0.16653497 0.18315109 0.14794407
## South_3544 Mountain_3544 Pacific_3544 poor_3544 mid_3544 rich_3544
## 1 0.06026353 0.04739747 0.05822486 0.1019749 0.04717272 0.08553870
## 2 0.05966057 0.04651163 0.06347796 0.1117548 0.04566838 0.06499159
## 3 0.05914089 0.04880077 0.07552538 0.1291426 0.05050321 0.04445951
## 4 0.09637035 0.09189904 0.13134638 0.2012208 0.09024739 0.06573916
## 5 0.14230600 0.13584194 0.17480047 0.2813137 0.12815751 0.08622046
## 6 0.14312592 0.13943820 0.17694864 0.2919112 0.13267625 0.06803283
## all_4554 HS_4554 SC_4554 BAp_4554 BAo_4554 GD_4554
## 1 0.07254649 0.06840792 0.07903755 0.15360889 0.15360889 NA
## 2 0.05968794 0.05833439 0.05443478 0.10466047 0.10466047 NA
## 3 0.05250871 0.05036563 0.04816180 0.08623774 0.08623774 NA
## 4 0.05947824 0.05988244 0.04654087 0.07301884 0.06416529 0.08394886
## 5 0.08804394 0.09442809 0.07558786 0.09208417 0.09097472 0.09362802
## 6 0.08823342 0.09189007 0.07795481 0.09333365 0.09313480 0.09362876
## White_4554 Black_4554 Hisp_4554 NE_4554 MA_4554 Midwest_4554
## 1 0.07246692 0.06913249 0.06636058 0.10236412 0.09264788 0.07285321
## 2 0.05754799 0.07899168 0.05810740 0.08028082 0.07860635 0.05791163
## 3 0.04765354 0.08624602 0.06522951 0.06930253 0.07508466 0.04807290
## 4 0.05092552 0.11617699 0.07613556 0.07047502 0.08373134 0.05398391
## 5 0.07578174 0.17587334 0.09418009 0.10232170 0.11269659 0.08302437
## 6 0.07516912 0.18154531 0.09409896 0.09868408 0.10953635 0.08207629
## South_4554 Mountain_4554 Pacific_4554 poor_4554 mid_4554 rich_4554
## 1 0.05977295 0.04754183 0.05996993 0.1030055 0.05364421 0.07908591
## 2 0.05174462 0.03970134 0.04826312 0.1016489 0.04221637 0.05142867
## 3 0.04485348 0.03374438 0.04958992 0.1003011 0.03830266 0.03311296
## 4 0.05043636 0.04459411 0.06461875 0.1148335 0.04562332 0.03136386
## 5 0.07631858 0.07637774 0.09896832 0.1718976 0.07055672 0.03897342
## 6 0.07886513 0.07405971 0.10119511 0.1759369 0.07407508 0.02857320
## nokids_all_2534 kids_all_2534 nokids_HS_2534 nokids_SC_2534
## 1 0.4640564 0.002820625 0.4430148 0.5000402
## 2 0.4309043 0.009868596 0.4246779 0.4333479
## 3 0.4464304 0.025285667 0.4319342 0.4505900
## 4 0.5425242 0.060277451 0.5464881 0.5238446
## 5 0.5714531 0.099472713 0.5711395 0.5700042
## 6 0.5852213 0.110178467 0.6045475 0.5810912
## nokids_BAp_2534 nokids_BAo_2534 nokids_GD_2534 kids_HS_2534 kids_SC_2534
## 1 0.5619099 0.5619099 NA 0.003318886 0.001150824
## 2 0.4554766 0.4554766 NA 0.012465915 0.003699982
## 3 0.4719700 0.4719700 NA 0.031930752 0.018135401
## 4 0.5560765 0.5633301 0.5332628 0.078470444 0.052032702
## 5 0.5729677 0.5862213 0.5367160 0.127193577 0.097625310
## 6 0.5698644 0.5864967 0.5258800 0.141395652 0.110030662
## kids_BAp_2534 kids_BAo_2534 kids_GD_2534 nokids_poor_2534
## 1 0.0005751073 0.0005751073 NA 0.4933061
## 2 0.0014683425 0.0014683425 NA 0.5097742
## 3 0.0062544364 0.0062544364 NA 0.5740402
## 4 0.0171241042 0.0181766027 0.01374234 0.6546908
## 5 0.0370024452 0.0401009875 0.02761467 0.7055451
## 6 0.0399801447 0.0445838012 0.02645041 0.7147334
## nokids_mid_2534 nokids_rich_2534 kids_poor_2534 kids_mid_2534
## 1 0.4100080 0.4921184 0.008722711 0.0007532065
## 2 0.3764538 0.4288948 0.029974945 0.0033771145
## 3 0.3998250 0.3848089 0.077926214 0.0102368871
## 4 0.5186604 0.4750156 0.170763774 0.0274655254
## 5 0.5690228 0.4458023 0.256281918 0.0597845173
## 6 0.5864741 0.4461111 0.280146488 0.0677954572
## kids_rich_2534
## 1 0.0008027331
## 2 0.0030435661
## 3 0.0068317224
## 4 0.0182329127
## 5 0.0295644698
## 6 0.0336540502
dim(marriage)
## [1] 17 75
colnames(marriage)
## [1] "X" "year" "date"
## [4] "all_2534" "HS_2534" "SC_2534"
## [7] "BAp_2534" "BAo_2534" "GD_2534"
## [10] "White_2534" "Black_2534" "Hisp_2534"
## [13] "NE_2534" "MA_2534" "Midwest_2534"
## [16] "South_2534" "Mountain_2534" "Pacific_2534"
## [19] "poor_2534" "mid_2534" "rich_2534"
## [22] "all_3544" "HS_3544" "SC_3544"
## [25] "BAp_3544" "BAo_3544" "GD_3544"
## [28] "White_3544" "Black_3544" "Hisp_3544"
## [31] "NE_3544" "MA_3544" "Midwest_3544"
## [34] "South_3544" "Mountain_3544" "Pacific_3544"
## [37] "poor_3544" "mid_3544" "rich_3544"
## [40] "all_4554" "HS_4554" "SC_4554"
## [43] "BAp_4554" "BAo_4554" "GD_4554"
## [46] "White_4554" "Black_4554" "Hisp_4554"
## [49] "NE_4554" "MA_4554" "Midwest_4554"
## [52] "South_4554" "Mountain_4554" "Pacific_4554"
## [55] "poor_4554" "mid_4554" "rich_4554"
## [58] "nokids_all_2534" "kids_all_2534" "nokids_HS_2534"
## [61] "nokids_SC_2534" "nokids_BAp_2534" "nokids_BAo_2534"
## [64] "nokids_GD_2534" "kids_HS_2534" "kids_SC_2534"
## [67] "kids_BAp_2534" "kids_BAo_2534" "kids_GD_2534"
## [70] "nokids_poor_2534" "nokids_mid_2534" "nokids_rich_2534"
## [73] "kids_poor_2534" "kids_mid_2534" "kids_rich_2534"
marriage_long <- marriage %>% gather(group, percentage, colnames(marriage)[4]:colnames(marriage)[75]) %>% select(-X)
head(marriage_long)
## year date group percentage
## 1 1960 1960-01-01 all_2534 0.1233145
## 2 1970 1970-01-01 all_2534 0.1269715
## 3 1980 1980-01-01 all_2534 0.1991767
## 4 1990 1990-01-01 all_2534 0.2968306
## 5 2000 2000-01-01 all_2534 0.3450087
## 6 2001 2001-01-01 all_2534 0.3527767
marriage_long <- marriage_long %>% mutate(group = str_replace(group, "kids_", "kids ")) %>% separate(group, c("category", "age"), sep = "\\_")
head(marriage_long)
## year date category age percentage
## 1 1960 1960-01-01 all 2534 0.1233145
## 2 1970 1970-01-01 all 2534 0.1269715
## 3 1980 1980-01-01 all 2534 0.1991767
## 4 1990 1990-01-01 all 2534 0.2968306
## 5 2000 2000-01-01 all 2534 0.3450087
## 6 2001 2001-01-01 all 2534 0.3527767
levels(factor(marriage_long$age))
## [1] "2534" "3544" "4554"
marriage_long <- marriage_long %>% mutate(age = str_replace(age, "25", "25-")) %>% mutate(age = str_replace(age, "35", "35-")) %>% mutate(age = str_replace(age, "45", "45-"))
head(marriage_long)
## year date category age percentage
## 1 1960 1960-01-01 all 25-34 0.1233145
## 2 1970 1970-01-01 all 25-34 0.1269715
## 3 1980 1980-01-01 all 25-34 0.1991767
## 4 1990 1990-01-01 all 25-34 0.2968306
## 5 2000 2000-01-01 all 25-34 0.3450087
## 6 2001 2001-01-01 all 25-34 0.3527767
group_by(marriage_long,category) %>% summarise(mean=mean(percentage, na.rm = TRUE), median=median(percentage, na.rm = TRUE), sd=sd(percentage, na.rm = TRUE))
## # A tibble: 36 x 4
## category mean median sd
## <chr> <dbl> <dbl> <dbl>
## 1 all 0.20702941 0.15967630 0.13260768
## 2 BAo 0.21935146 0.15787306 0.13722435
## 3 BAp 0.21472066 0.15635891 0.13027962
## 4 Black 0.34518965 0.30969793 0.18187250
## 5 GD 0.21245415 0.15693960 0.11891211
## 6 Hisp 0.20597339 0.15469520 0.12520360
## 7 HS 0.21886352 0.16993703 0.13706999
## 8 kids all 0.11367162 0.12388103 0.05745119
## 9 kids BAo 0.04751170 0.05198276 0.02644573
## 10 kids BAp 0.04200286 0.04656597 0.02274633
## # ... with 26 more rows
group_by(marriage_long,age) %>% summarise(mean=mean(percentage, na.rm = TRUE), median=median(percentage, na.rm = TRUE), sd=sd(percentage, na.rm = TRUE))
## # A tibble: 3 x 4
## age mean median sd
## <chr> <dbl> <dbl> <dbl>
## 1 25-34 0.3615562 0.3867121 0.20797066
## 2 35-44 0.1681077 0.1638645 0.07191929
## 3 45-54 0.1106611 0.1046605 0.04907697
group_by(marriage_long,category) %>% summarise(max=max(percentage, na.rm = TRUE), min=min(percentage, na.rm = TRUE))
## # A tibble: 36 x 3
## category max min
## <chr> <dbl> <dbl>
## 1 all 0.49434533 0.0525087145
## 2 BAo 0.50226213 0.0641652869
## 3 BAp 0.47663568 0.0730188438
## 4 Black 0.68470875 0.0691324901
## 5 GD 0.41645825 0.0839488589
## 6 Hisp 0.48740313 0.0581074016
## 7 HS 0.52352118 0.0503656291
## 8 kids all 0.18361790 0.0028206251
## 9 kids BAo 0.08304373 0.0005751073
## 10 kids BAp 0.07100196 0.0005751073
## # ... with 26 more rows
group_by(marriage_long,age) %>% summarise(max=max(percentage, na.rm = TRUE), min=min(percentage, na.rm = TRUE))
## # A tibble: 3 x 3
## age max min
## <chr> <dbl> <dbl>
## 1 25-34 0.8093071 0.0005751073
## 2 35-44 0.4044349 0.0444595055
## 3 45-54 0.2921425 0.0285731986