Install packages
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
library(stringi)
druguse<- read.csv("https://raw.githubusercontent.com/Lidiia25/Project-607-2/master/drug-use-by-age.csv")
head(druguse)
## age n alcohol.use alcohol.frequency marijuana.use marijuana.frequency
## 1 12 2798 3.9 3 1.1 4
## 2 13 2757 8.5 6 3.4 15
## 3 14 2792 18.1 5 8.7 24
## 4 15 2956 29.2 6 14.5 25
## 5 16 3058 40.1 10 22.5 30
## 6 17 3038 49.3 13 28.0 36
## cocaine.use cocaine.frequency crack.use crack.frequency heroin.use
## 1 0.1 5.0 0.0 - 0.1
## 2 0.1 1.0 0.0 3.0 0.0
## 3 0.1 5.5 0.0 - 0.1
## 4 0.5 4.0 0.1 9.5 0.2
## 5 1.0 7.0 0.0 1.0 0.1
## 6 2.0 5.0 0.1 21.0 0.1
## heroin.frequency hallucinogen.use hallucinogen.frequency inhalant.use
## 1 35.5 0.2 52 1.6
## 2 - 0.6 6 2.5
## 3 2.0 1.6 3 2.6
## 4 1.0 2.1 4 2.5
## 5 66.5 3.4 3 3.0
## 6 64.0 4.8 3 2.0
## inhalant.frequency pain.releiver.use pain.releiver.frequency
## 1 19.0 2.0 36
## 2 12.0 2.4 14
## 3 5.0 3.9 12
## 4 5.5 5.5 10
## 5 3.0 6.2 7
## 6 4.0 8.5 9
## oxycontin.use oxycontin.frequency tranquilizer.use
## 1 0.1 24.5 0.2
## 2 0.1 41.0 0.3
## 3 0.4 4.5 0.9
## 4 0.8 3.0 2.0
## 5 1.1 4.0 2.4
## 6 1.4 6.0 3.5
## tranquilizer.frequency stimulant.use stimulant.frequency meth.use
## 1 52.0 0.2 2.0 0.0
## 2 25.5 0.3 4.0 0.1
## 3 5.0 0.8 12.0 0.1
## 4 4.5 1.5 6.0 0.3
## 5 11.0 1.8 9.5 0.3
## 6 7.0 2.8 9.0 0.6
## meth.frequency sedative.use sedative.frequency
## 1 - 0.2 13.0
## 2 5.0 0.1 19.0
## 3 24.0 0.2 16.5
## 4 10.5 0.4 30.0
## 5 36.0 0.2 3.0
## 6 48.0 0.5 6.5
In this dataset I want to create colums for each drug and split use and percentage in two sepate columns.
Create separate dataframes for “use” and “frequency”
drug1<- select(druguse, age, n,contains("use") )
drug11<- gather(drug1, "Drug", "Percentage", 3:15)
drug11<- mutate(drug11, row_number())
head(drug11)
## age n Drug Percentage row_number()
## 1 12 2798 alcohol.use 3.9 1
## 2 13 2757 alcohol.use 8.5 2
## 3 14 2792 alcohol.use 18.1 3
## 4 15 2956 alcohol.use 29.2 4
## 5 16 3058 alcohol.use 40.1 5
## 6 17 3038 alcohol.use 49.3 6
drug2<- select(druguse, age, n,contains("freq") )
drug22<- gather(drug2, "Drug Frequency", "freq per year", 3:15)
## Warning: attributes are not identical across measure variables;
## they will be dropped
drug22<- mutate(drug22, row_number())
head(drug22)
## age n Drug Frequency freq per year row_number()
## 1 12 2798 alcohol.frequency 3 1
## 2 13 2757 alcohol.frequency 6 2
## 3 14 2792 alcohol.frequency 5 3
## 4 15 2956 alcohol.frequency 6 4
## 5 16 3058 alcohol.frequency 10 5
## 6 17 3038 alcohol.frequency 13 6
Join tables
druguse2<- inner_join(drug11, drug22, by = c("row_number()", "age", "n"))
head(druguse2)
## age n Drug Percentage row_number() Drug Frequency
## 1 12 2798 alcohol.use 3.9 1 alcohol.frequency
## 2 13 2757 alcohol.use 8.5 2 alcohol.frequency
## 3 14 2792 alcohol.use 18.1 3 alcohol.frequency
## 4 15 2956 alcohol.use 29.2 4 alcohol.frequency
## 5 16 3058 alcohol.use 40.1 5 alcohol.frequency
## 6 17 3038 alcohol.use 49.3 6 alcohol.frequency
## freq per year
## 1 3
## 2 6
## 3 5
## 4 6
## 5 10
## 6 13
Remove “.use” from Drug column
Drugs<- gsub(".use", "", druguse2$Drug)
Drugs<-data_frame(Drugs)
Drugs<- mutate(Drugs, row_number())
druguse3<- left_join(druguse2, Drugs, by = c("row_number()"))
drugusefinal <- select(druguse3, Drugs, age, n, Percentage,`freq per year`)
head(drugusefinal)
## Drugs age n Percentage freq per year
## 1 alcohol 12 2798 3.9 3
## 2 alcohol 13 2757 8.5 6
## 3 alcohol 14 2792 18.1 5
## 4 alcohol 15 2956 29.2 6
## 5 alcohol 16 3058 40.1 10
## 6 alcohol 17 3038 49.3 13
I will add a separate column where I can see number of those in an age group who used drug or alcohol in the past 12 months based on procentage.
drugusefinal2<- mutate(drugusefinal, "Count of who used drug"= n*`Percentage`/100)
head(drugusefinal2)
## Drugs age n Percentage freq per year Count of who used drug
## 1 alcohol 12 2798 3.9 3 109.122
## 2 alcohol 13 2757 8.5 6 234.345
## 3 alcohol 14 2792 18.1 5 505.352
## 4 alcohol 15 2956 29.2 6 863.152
## 5 alcohol 16 3058 40.1 10 1226.258
## 6 alcohol 17 3038 49.3 13 1497.734
Also I want to see average percentage for each drug
drugusefinal2%>%
group_by(Drugs)%>%
summarise("Avg Percentage" = mean(`Percentage`, na.rm = TRUE))
## # A tibble: 13 x 2
## Drugs `Avg Percentage`
## <chr> <dbl>
## 1 alcohol 55.4294118
## 2 cocaine 2.1764706
## 3 crack 0.2941176
## 4 hallucinogen 3.3941176
## 5 heroin 0.3529412
## 6 inhalant 1.3882353
## 7 marijuana 18.9235294
## 8 meth 0.3823529
## 9 oxycontin 0.9352941
## 10 pain.releiver 6.2705882
## 11 sedative 0.2823529
## 12 stimulant 1.9176471
## 13 tranquilizer 2.8058824
yoga<- read.csv("https://raw.githubusercontent.com/Lidiia25/Project-607-2/master/Yoga.csv")
head(yoga)
## X Alabama..us.al. Alaska..us.ak. Arizona..us.az. Arkansas..us.ar.
## 1 2004-01 20 23 21 24
## 2 2004-02 8 26 25 16
## 3 2004-03 10 26 22 26
## 4 2004-04 15 34 24 18
## 5 2004-05 15 14 24 11
## 6 2004-06 12 21 23 14
## California..us.ca. Colorado..us.co. Connecticut..us.ct. Delaware..us.de.
## 1 32 33 27 47
## 2 27 30 26 28
## 3 28 29 30 51
## 4 25 27 22 27
## 5 25 24 25 20
## 6 27 27 26 22
## District.of.Columbia..us.dc. Florida..us.fl. Georgia..us.ga.
## 1 32 21 21
## 2 36 17 20
## 3 29 17 20
## 4 29 19 15
## 5 24 20 16
## 6 26 18 18
## Hawaii..us.hi. Idaho..us.id. Illinois..us.il. Indiana..us.in.
## 1 36 21 25 24
## 2 24 22 23 14
## 3 36 21 25 17
## 4 30 18 17 14
## 5 39 14 17 9
## 6 44 17 20 18
## Iowa..us.ia. Kansas..us.ks. Kentucky..us.ky. Louisiana..us.la.
## 1 14 20 17 20
## 2 16 12 19 15
## 3 18 13 18 17
## 4 19 19 12 20
## 5 14 15 15 14
## 6 17 18 17 16
## Maine..us.me. Maryland..us.md. Massachusetts..us.ma. Michigan..us.mi.
## 1 29 26 41 19
## 2 29 23 33 18
## 3 26 22 32 17
## 4 21 17 31 15
## 5 25 21 27 15
## 6 22 23 29 17
## Minnesota..us.mn. Mississippi..us.ms. Missouri..us.mo. Montana..us.mt.
## 1 26 16 19 44
## 2 22 20 18 26
## 3 21 18 15 41
## 4 17 18 12 25
## 5 22 13 15 24
## 6 18 18 20 21
## Nebraska..us.ne. Nevada..us.nv. New.Hampshire..us.nh. New.Jersey..us.nj.
## 1 15 20 45 27
## 2 21 25 20 22
## 3 16 24 22 23
## 4 16 17 26 20
## 5 14 25 23 22
## 6 15 23 27 21
## New.Mexico..us.nm. New.York..us.ny. North.Carolina..us.nc.
## 1 33 35 23
## 2 25 28 22
## 3 26 29 20
## 4 18 26 18
## 5 21 28 16
## 6 32 28 22
## North.Dakota..us.nd. Ohio..us.oh. Oklahoma..us.ok. Oregon..us.or.
## 1 52 19 22 34
## 2 45 16 19 30
## 3 45 15 12 28
## 4 45 13 11 22
## 5 45 16 17 25
## 6 45 16 17 27
## Pennsylvania..us.pa. Rhode.Island..us.ri. South.Carolina..us.sc.
## 1 19 44 24
## 2 18 26 19
## 3 20 27 18
## 4 19 31 14
## 5 16 26 11
## 6 20 33 19
## South.Dakota..us.sd. Tennessee..us.tn. Texas..us.tx. Utah..us.ut.
## 1 25 21 24 26
## 2 22 18 16 20
## 3 21 16 17 10
## 4 21 21 16 20
## 5 28 15 17 19
## 6 28 16 17 16
## Vermont..us.vt. Virginia..us.va. Washington..us.wa.
## 1 42 22 30
## 2 39 16 29
## 3 41 19 27
## 4 37 17 25
## 5 31 18 25
## 6 34 22 27
## West.Virginia..us.wv. Wisconsin..us.wi. Wyoming..us.wy.
## 1 23 18 0
## 2 17 17 37
## 3 27 20 35
## 4 26 17 37
## 5 21 18 35
## 6 28 19 35
In this dataset I will split “X” column in 2 columns “Year” and “Month”. Also I will create column “State”.
yoga2<- gather(yoga, "state", "n", 2:52)
head(yoga2)
## X state n
## 1 2004-01 Alabama..us.al. 20
## 2 2004-02 Alabama..us.al. 8
## 3 2004-03 Alabama..us.al. 10
## 4 2004-04 Alabama..us.al. 15
## 5 2004-05 Alabama..us.al. 15
## 6 2004-06 Alabama..us.al. 12
Separate “X” column in 2 columns.
yoga3<- separate(yoga2, X, c("y", "m"))
Clean data in State column
yoga4<- mutate(yoga3, state= gsub("..us....", "", yoga3$state))
head(yoga4)
## y m state n
## 1 2004 01 Alabama 20
## 2 2004 02 Alabama 8
## 3 2004 03 Alabama 10
## 4 2004 04 Alabama 15
## 5 2004 05 Alabama 15
## 6 2004 06 Alabama 12
I want to know when people were most interested in yoga in the past 10 years.
yoga4%>%
group_by(y)%>%
summarise(n = sum(n))%>%
arrange(desc(n))
## # A tibble: 13 x 2
## y n
## <chr> <int>
## 1 2015 18475
## 2 2014 17442
## 3 2013 16350
## 4 2012 14875
## 5 2010 14874
## 6 2011 14197
## 7 2004 13467
## 8 2005 13173
## 9 2006 12952
## 10 2009 12671
## 11 2007 12353
## 12 2008 12085
## 13 2016 6531
Which states are showing the most interest in yoga.
yoga4%>%
group_by(state)%>%
summarise(n = sum(n))%>%
arrange(desc(n))
## # A tibble: 51 x 2
## state n
## <chr> <int>
## 1 Vermont 7529
## 2 Hawaii 5699
## 3 Rhode.Island 5546
## 4 New.Hampshire 5292
## 5 Maine 5251
## 6 Montana 4976
## 7 Wyoming 4815
## 8 Massa 4763
## 9 Alaska 4726
## 10 Colorado 4578
## # ... with 41 more rows
catfish<- read.csv("https://raw.githubusercontent.com/Lidiia25/Project-607-2/master/Catfish.csv")
head(catfish)
## Size.category X1992 X1993 X1994 X1995 X1996 X1997 X1998
## 1 Broodfish 1/ 1491 1169 1183 1301 1171 1163 1187
## 2 Fingerling/fry 2/ 849412 669491 648628 724693 823397 873457 975542
## 3 Stockers 3/ 634353 571254 548207 554342 627834 754816 607878
## 4 Small foodsize 4/ 166731 153600 134314 138160 156297 178448 178511
## 5 Medium foodsize 5/ 70495 61894 48851 59159 64858 84725 62140
## 6 Large foodsize 6/ 6769 6698 5196 4536 6644 7810 7295
## X1999 X2000 X2001 X2002 X2003 X2004 X2005 X2006 X2007
## 1 1155 1377 1327 1171 1303 1113 1053 1091 886
## 2 986368 1053300 1023533 1066400 990163 745849 712144 1045266 985620
## 3 678682 790683 845287 676378 775226 890275 660000 781958 586320
## 4 182251 200032 239655 287591 254920 261323 243090 214848 210340
## 5 63049 77149 87926 106117 127908 109120 95240 103591 104080
## 6 9266 5812 6872 10746 11195 10947 10642 10823 8986
## X2008 X2009 X2010 X2011 X2012 X2013 X2014 X2015 X2016
## 1 801 704 536 495 562 540 650 577 520
## 2 951910 728340 429590 568990 451100 398510 420060 449510 328570
## 3 688844 586069 366090 380660 463485 339260 289080 248790 204800
## 4 204750 193870 169030 115560 112970 103520 102190 96810 100850
## 5 107800 105610 91790 54130 64740 58015 50600 48220 45775
## 6 9290 9316 8570 6212 3595 5155 4500 5090 3520
In this dataset I will create columns for each size and column “Year”
catfish2<- gather(catfish, "Year", "n", 2:26)
catfish2<- mutate(catfish2, Year= gsub("X", "", catfish2$Year))
head(catfish2)
## Size.category Year n
## 1 Broodfish 1/ 1992 1491
## 2 Fingerling/fry 2/ 1992 849412
## 3 Stockers 3/ 1992 634353
## 4 Small foodsize 4/ 1992 166731
## 5 Medium foodsize 5/ 1992 70495
## 6 Large foodsize 6/ 1992 6769
Create and rename new columns based on fish size
catfish3<- spread(catfish2, Size.category,n)
names(catfish3) <- c("Year","bloodfish", "fingerling", "large_foodsize", "medium_foodsize", "small_foodsize", "stockers")
head(catfish3)
## Year bloodfish fingerling large_foodsize medium_foodsize small_foodsize
## 1 1992 1491 849412 6769 70495 166731
## 2 1993 1169 669491 6698 61894 153600
## 3 1994 1183 648628 5196 48851 134314
## 4 1995 1301 724693 4536 59159 138160
## 5 1996 1171 823397 6644 64858 156297
## 6 1997 1163 873457 7810 84725 178448
## stockers
## 1 634353
## 2 571254
## 3 548207
## 4 554342
## 5 627834
## 6 754816
I want to calculate total ammount of catfish by year and change compare to previous year.
catfish3%>%
group_by(Year)%>%
summarise(n = sum(bloodfish,fingerling, large_foodsize, medium_foodsize, small_foodsize, stockers)) %>%
mutate(change = n -lag(n))
## # A tibble: 25 x 3
## Year n change
## <chr> <int> <int>
## 1 1992 1729251 NA
## 2 1993 1464106 -265145
## 3 1994 1386379 -77727
## 4 1995 1482191 95812
## 5 1996 1680201 198010
## 6 1997 1900419 220218
## 7 1998 1832553 -67866
## 8 1999 1920771 88218
## 9 2000 2128353 207582
## 10 2001 2204600 76247
## # ... with 15 more rows