library("tidyr")
library("dplyr")
library("ggplot2")
## Warning: package 'ggplot2' was built under R version 3.4.2
The first data set is shark attacks, it list the attack happened in different conntry and different location from 1543 to 2017.For each case it include what type of activities the person was doing during attack, person’s gender,age and what kind of injury the person end up with as well as fatalness of the injury.
The question we are trying to ansewr are the folowing: 1 in last two years, which location in each country has most case(highest frequency location) 2 in last two years what activities cause most fatal injury case in each country(most dangerous activities) 3 in each country, what’s the percentage Of fatal injury case belong to female
table<- read.csv('attacks.csv',sep=',',header=TRUE)
View(table)
tbl_df(table)%>%
mutate_if(is.factor,as.character)%>%
select(Case.Number,Date,Year,Country,Location,Activity,Sex,Age,Injury,Fatal..Y.N.)%>%
filter(Location!='',Country!='')%>%
group_by(Year,Country,Location)%>%
summarise(n=n())%>%
mutate(r=row_number(desc(n)))%>%
filter(r==1,Year==2016||Year==2017 )%>%
arrange(desc(Year),desc(n))%>%
select(-r)
## # A tibble: 30 x 4
## # Groups: Year, Country [30]
## Year Country
## <int> <chr>
## 1 2017 USA
## 2 2017 AUSTRALIA
## 3 2017 BAHAMAS
## 4 2017 BRAZIL
## 5 2017 INDONESIA
## 6 2017 MEXICO
## 7 2017 NEW ZEALAND
## 8 2017 REUNION
## 9 2017 SOUTH AFRICA
## 10 2017 ST HELENA, British overseas territory
## # ... with 20 more rows, and 2 more variables: Location <chr>, n <int>
From the result, we see that in most country there is only one case in each location, however Melbourne Beachm,USA has 3 cases in 2017 and New Smyrna Beach,USA has 13 cases in 2016. So people should be careful when spending holiday in these two place.
tbl_df(table)%>%
mutate_if(is.factor,as.character)%>%
select(Case.Number,Date,Year,Country,Location,Activity,Sex,Age,Injury,Fatal..Y.N.)%>%
filter(Year==2016| Year==2017)%>%
group_by(Year,Country,Activity)%>%
summarise(n=n())%>%
mutate(r=row_number(desc(n)))%>%
filter(r==1)%>%
arrange(desc(Year),desc(n))%>%
select(-r)
## # A tibble: 34 x 4
## # Groups: Year, Country [34]
## Year Country Activity
## <int> <chr> <chr>
## 1 2017 USA Surfing
## 2 2017 AUSTRALIA Surfing
## 3 2017 REUNION Body boarding
## 4 2017 BAHAMAS Scuba Diving
## 5 2017 BRAZIL Grabbing shark for a selfie
## 6 2017 INDONESIA Surfing
## 7 2017 MEXICO Snorkeling
## 8 2017 NEW ZEALAND Body boarding
## 9 2017 SOUTH AFRICA Kayak fishing
## 10 2017 ST HELENA, British overseas territory Snorkeling
## # ... with 24 more rows, and 1 more variables: n <int>
Sufing seems to be the most dangerous activities in a lot of countries especally in USA and AUSTRALIA, and Swimming also case some fatal injury cases in mutiple countries
tbl_df(table)%>%
mutate_if(is.factor,as.character)%>%
select(Case.Number,Date,Year,Country,Sex,Fatal..Y.N.)%>%
filter(Fatal..Y.N.=='Y',Country!='',Sex !='')%>%
group_by(Country,Sex)%>%
summarise(n=n())%>%
spread(Sex,n)%>%
replace(is.na(.),0)%>%
mutate(PercentageOfFemale=round((F/(F+M))*100))%>%
select(-.)%>%
arrange(desc(PercentageOfFemale))
## # A tibble: 136 x 4
## # Groups: Country [136]
## Country F M PercentageOfFemale
## <chr> <dbl> <dbl> <dbl>
## 1 GUATEMALA 1 0 100
## 2 KOREA 1 0 100
## 3 MALAYSIA 1 0 100
## 4 MALDIVE ISLANDS 1 0 100
## 5 YEMEN 1 0 100
## 6 BURMA 2 1 67
## 7 BERMUDA 1 1 50
## 8 CAPE VERDE 1 1 50
## 9 CROATIA 7 7 50
## 10 FRENCH POLYNESIA 1 1 50
## # ... with 126 more rows
In general, Male has more chance to get fatal injury than female, maybe it’s because male tend to do more dangerous activities, however in CROATIA, male and female both have 7 cases.
The second dataset list the number of people use different drugs in the past 12 months in different age group and the number of times a user in an age group used alcohol in the past 12 month
We want to know two things 1 what durg does a user use most in each age group 2 what is the most widely used drug in each age group
table<-read.csv("drug.csv",head=TRUE,sep=',')
View(table)
tbl_df(table)%>%
gather(drug,frequency,contains("frequency"))%>%
mutate(drug=sub(".frequency",'',drug))%>%
arrange(age)%>%
group_by(age)%>%
mutate(r=row_number(desc(frequency)))%>%
filter(r==1)%>%
select(age,drug,frequency)
## Warning: attributes are not identical across measure variables;
## they will be dropped
## # A tibble: 17 x 3
## # Groups: age [17]
## age drug frequency
## <fctr> <chr> <chr>
## 1 12 hallucinogen 52
## 2 13 alcohol 6
## 3 14 cocaine 5.5
## 4 15 crack 9.5
## 5 16 stimulant 9.5
## 6 17 pain.releiver 9
## 7 18 stimulant 8
## 8 19 oxycontin 7.5
## 9 20 cocaine 8.0
## 10 21 sedative 9
## 11 22-23 heroin 57.5
## 12 24-25 heroin 88.0
## 13 26-29 stimulant 7
## 14 30-34 cocaine 8.0
## 15 35-49 tranquilizer 6
## 16 50-64 crack 62.0
## 17 65+ alcohol 52
From the result we find some serious problems, first of all for those 12 years old children who use hallucinogen, they really used a lot in past 12 months,although only 2% of children within the age grou use hallucinogen. So those children really need to supervised by their parents. What even worse is that some yong adult from 22 to 25 years old take heroin a lot and look back to the oringal table there are relative high proportion of yong adult use heroin within these age group. in the table blew, we also find that easch yong adult in age group 22-23 and 24-25 are not only use heroin a lot, but also the proportion of heroin user are highest across all age groups. So we really should educate yong people more about how dangerous heroin is.
tbl_df(table)%>%
select(age,heroin.use)%>%
arrange(desc(heroin.use))
## # A tibble: 17 x 2
## age heroin.use
## <fctr> <dbl>
## 1 22-23 1.1
## 2 20 0.9
## 3 24-25 0.7
## 4 21 0.6
## 5 26-29 0.6
## 6 19 0.5
## 7 18 0.4
## 8 30-34 0.4
## 9 15 0.2
## 10 12 0.1
## 11 14 0.1
## 12 16 0.1
## 13 17 0.1
## 14 35-49 0.1
## 15 50-64 0.1
## 16 13 0.0
## 17 65+ 0.0
tbl_df(table)%>%
gather(drug,usecase,contains("use"))%>%
mutate(drug=sub(".use",'',drug))%>%
select(age,drug,n,usecase)%>%
mutate(number_of_people=n*(usecase/100))%>%
mutate(number_of_people=round(number_of_people))%>%
group_by(age)%>%
mutate(r=row_number(desc(number_of_people)))%>%
filter(r==1)%>%
select(age,drug,number_of_people)
## # A tibble: 17 x 3
## # Groups: age [17]
## age drug number_of_people
## <fctr> <chr> <dbl>
## 1 12 alcohol 109
## 2 13 alcohol 234
## 3 14 alcohol 505
## 4 15 alcohol 863
## 5 16 alcohol 1226
## 6 17 alcohol 1498
## 7 18 alcohol 1449
## 8 19 alcohol 1436
## 9 20 alcohol 1583
## 10 21 alcohol 1959
## 11 22-23 alcohol 3963
## 12 24-25 alcohol 3815
## 13 26-29 alcohol 2121
## 14 30-34 alcohol 2220
## 15 35-49 alcohol 5543
## 16 50-64 alcohol 2636
## 17 65+ alcohol 1207
From the table above, we can see that alcohol are the most widely used drug regradless the age, which is expected, however what interesting is that, according to the law, people with age under 16 can not drink outside, which means some parents is not doing good job on alcohol regulation at home. Also it’s surprising that people start to drink at 12.
tbl_df(table)%>%
gather(drug,usecase,contains("use"))%>%
mutate(drug=sub(".use",'',drug))%>%
select(age,drug,n,usecase)%>%
mutate(number_of_people=n*(usecase/100))%>%
mutate(number_of_people=round(number_of_people))%>%
group_by(age)%>%
top_n(5,number_of_people)%>%
select(age,drug,number_of_people)%>%
arrange(age,desc(number_of_people))
## # A tibble: 89 x 3
## # Groups: age [17]
## age drug number_of_people
## <fctr> <chr> <dbl>
## 1 12 alcohol 109
## 2 12 pain.releiver 56
## 3 12 inhalant 45
## 4 12 marijuana 31
## 5 12 hallucinogen 6
## 6 12 tranquilizer 6
## 7 12 stimulant 6
## 8 12 sedative 6
## 9 13 alcohol 234
## 10 13 marijuana 94
## # ... with 79 more rows
It’s interesting that pain releiver ranked very top accross all age group and seems like we all have pain problems, in addition,marijuana is also mentioned accross all age group which is also a pain releiver
The dataset contain marriage rate for different group of people from 1960 to 2012 group include different education level like high school, some college, undergraduate, and gradute. It also include different race and so on. The question we want to know is the trend of marriage rate for different education level.
table <- read.csv('marriage_rate.csv', sep =',')
View(table)
tbl_df(table)%>%
select(year,all_2534,HS_2534,SC_2534,BAp_2534,BAo_2534,GD_2534)%>%
replace(is.na(.),0)%>%
mutate(all_change=((all_2534-lag(all_2534))/lag(all_2534))*100,
HS_change=((HS_2534-lag(HS_2534))/lag(HS_2534))*100,
SC_change=((SC_2534-lag(SC_2534))/lag(SC_2534))*100,
BAp_change=((BAp_2534-lag(BAp_2534))/lag(BAp_2534))*100,
BAo_change=((BAo_2534-lag(BAo_2534))/lag(BAo_2534))*100,
GD_change=((GD_2534-lag(GD_2534))/lag(GD_2534))*100)%>%
replace(is.na(.),0)%>%
mutate(all_change=round(all_change,digits=2),
HS_change=round(HS_change,digits=2),
SC_change=round(SC_change,digits=2),
BAp_change=round(BAp_change,digits=2),
BAo_change=round(BAo_change,digits=2),
GD_change=round(GD_change,digits=2))%>%
select(year,all_2534,all_change,HS_2534,HS_change,SC_2534,SC_change,BAp_2534,BAp_change,BAo_2534,BAo_change,GD_2534,GD_change)
## # A tibble: 17 x 13
## year all_2534 all_change HS_2534 HS_change SC_2534 SC_change
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1960 0.1233145 0.00 0.1095332 0.00 0.1522818 0.00
## 2 1970 0.1269715 2.97 0.1094000 -0.12 0.1495096 -1.82
## 3 1980 0.1991767 56.87 0.1617313 47.83 0.2236916 49.62
## 4 1990 0.2968306 49.03 0.2777491 71.73 0.2780912 24.32
## 5 2000 0.3450087 16.23 0.3316545 19.41 0.3249205 16.84
## 6 2001 0.3527767 2.25 0.3446069 3.91 0.3341101 2.83
## 7 2002 0.3535249 0.21 0.3490367 1.29 0.3361595 0.61
## 8 2003 0.3620345 2.41 0.3581877 2.62 0.3418930 1.71
## 9 2004 0.3673247 1.46 0.3708102 3.52 0.3450748 0.93
## 10 2005 0.3793451 3.27 0.3870680 4.38 0.3596663 4.23
## 11 2006 0.4147656 9.34 0.4312162 11.41 0.3912177 8.77
## 12 2007 0.4269222 2.93 0.4441386 3.00 0.4084929 4.42
## 13 2008 0.4394414 2.93 0.4599162 3.55 0.4235094 3.68
## 14 2009 0.4625638 5.26 0.4845018 5.35 0.4469940 5.55
## 15 2010 0.4697332 1.55 0.4942221 2.01 0.4544084 1.66
## 16 2011 0.4833335 2.90 0.5115703 3.51 0.4685570 3.11
## 17 2012 0.4943453 2.28 0.5235212 2.34 0.4799344 2.43
## # ... with 6 more variables: BAp_2534 <dbl>, BAp_change <dbl>,
## # BAo_2534 <dbl>, BAo_change <dbl>, GD_2534 <dbl>, GD_change <dbl>
tbl_df(table)%>%
select(year,all_2534,HS_2534,SC_2534,BAp_2534,BAo_2534,GD_2534)%>%
replace(is.na(.),0)%>%
mutate(all_change=((all_2534-lag(all_2534))/lag(all_2534))*100,
HS_change=((HS_2534-lag(HS_2534))/lag(HS_2534))*100,
SC_change=((SC_2534-lag(SC_2534))/lag(SC_2534))*100,
BAp_change=((BAp_2534-lag(BAp_2534))/lag(BAp_2534))*100,
BAo_change=((BAo_2534-lag(BAo_2534))/lag(BAo_2534))*100,
GD_change=((GD_2534-lag(GD_2534))/lag(GD_2534))*100)%>%
replace(is.na(.),0)%>%
mutate(all_change=round(all_change,digits=2),
HS_change=round(HS_change,digits=2),
SC_change=round(SC_change,digits=2),
BAp_change=round(BAp_change,digits=2),
BAo_change=round(BAo_change,digits=2),
GD_change=round(GD_change,digits=2))%>%
select(year,all_2534,all_change,HS_2534,HS_change,SC_2534,SC_change,BAp_2534,BAp_change,BAo_2534,BAo_change,GD_2534,GD_change)%>%
ggplot( aes(x=year)) + geom_line(aes(y = HS_change, colour = "High School"))+ geom_line(aes(y = SC_change, colour = "Some College"))+
geom_line(aes(y = BAo_change, colour = "Bachelor"))+
geom_line(aes(y = GD_change, colour = "Graduate"))
marriage rate peaked at 1990 for people with graduate and high school education while it peaked at 1980 for people with bachelor and some college education. after 2000 all marriage rate decsrease. In addition, since 2000, people with higher education seems has lower marriage rate growth.