library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# 1. Dataset (New York City Leading Causes of Death)
# Read the NYC leading-causes-of-death CSV from the working directory, keeping
# text columns as character vectors, then preview the first six rows.
data_nyc_deaths <- read.csv(
  file = "New_York_City_Leading_Causes_of_Death.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
head(data_nyc_deaths, n = 6L)
## Year Ethnicity Sex Cause.of.Death
## 1 2010 NON-HISPANIC BLACK MALE HUMAN IMMUNODEFICIENCY VIRUS DISEASE
## 2 2010 NON-HISPANIC BLACK MALE INFLUENZA AND PNEUMONIA
## 3 2010 NON-HISPANIC BLACK MALE INTENTIONAL SELF-HARM (SUICIDE)
## 4 2010 NON-HISPANIC BLACK MALE MALIGNANT NEOPLASMS
## 5 2010 NON-HISPANIC BLACK MALE MENTAL DISORDERS DUE TO USE OF ALCOHOL
## 6 2010 NON-HISPANIC BLACK MALE NEPHRITIS, NEPHROTIC SYNDROME AND NEPHROSIS
## Count Percent
## 1 297 5
## 2 201 3
## 3 64 1
## 4 1540 23
## 5 50 1
## 6 70 1
# Leading cause of death for every (Year, Sex) pair.
# Exact duplicate records are removed first so they are not double counted.
# tally() sums Count per (Year, Sex, Cause.of.Death) and leaves the result
# grouped by (Year, Sex), so top_n() keeps the largest total in each pair.
gender <- data_nyc_deaths %>%
  distinct() %>%
  group_by(Year, Sex, Cause.of.Death) %>%
  tally(wt = Count, sort = TRUE) %>%
  top_n(n = 1, wt = n) %>%
  arrange(Year)
gender
## # A tibble: 10 x 4
## # Groups: Year, Sex [10]
## Year Sex Cause.of.Death n
## <int> <chr> <chr> <int>
## 1 2007 FEMALE DISEASES OF HEART 11618
## 2 2007 MALE DISEASES OF HEART 9577
## 3 2008 FEMALE DISEASES OF HEART 11462
## 4 2008 MALE DISEASES OF HEART 9456
## 5 2009 FEMALE DISEASES OF HEART 10630
## 6 2009 MALE DISEASES OF HEART 9173
## 7 2010 FEMALE DISEASES OF HEART 9343
## 8 2010 MALE DISEASES OF HEART 8344
## 9 2011 FEMALE DISEASES OF HEART 9009
## 10 2011 MALE DISEASES OF HEART 7713
# Leading cause of death for every (Year, Ethnicity) pair.
# Same recipe as the per-sex summary: de-duplicate, total Count per cause,
# then keep the top cause within each (Year, Ethnicity) group. Rows are left
# in the descending-total order produced by sort = TRUE.
ethnic <- data_nyc_deaths %>%
  distinct() %>%
  group_by(Year, Ethnicity, Cause.of.Death) %>%
  tally(wt = Count, sort = TRUE) %>%
  top_n(n = 1, wt = n)
ethnic
## # A tibble: 20 x 4
## # Groups: Year, Ethnicity [20]
## Year Ethnicity Cause.of.Death n
## <int> <chr> <chr> <int>
## 1 2007 NON-HISPANIC WHITE DISEASES OF HEART 12682
## 2 2008 NON-HISPANIC WHITE DISEASES OF HEART 12339
## 3 2009 NON-HISPANIC WHITE DISEASES OF HEART 11465
## 4 2010 NON-HISPANIC WHITE DISEASES OF HEART 9846
## 5 2011 NON-HISPANIC WHITE DISEASES OF HEART 9236
## 6 2007 NON-HISPANIC BLACK DISEASES OF HEART 4843
## 7 2008 NON-HISPANIC BLACK DISEASES OF HEART 4802
## 8 2009 NON-HISPANIC BLACK DISEASES OF HEART 4603
## 9 2010 NON-HISPANIC BLACK DISEASES OF HEART 4297
## 10 2011 NON-HISPANIC BLACK DISEASES OF HEART 4083
## 11 2008 HISPANIC DISEASES OF HEART 2775
## 12 2007 HISPANIC DISEASES OF HEART 2745
## 13 2009 HISPANIC DISEASES OF HEART 2731
## 14 2010 HISPANIC DISEASES OF HEART 2671
## 15 2011 HISPANIC DISEASES OF HEART 2549
## 16 2009 ASIAN & PACIFIC ISLANDER DISEASES OF HEART 1004
## 17 2011 ASIAN & PACIFIC ISLANDER MALIGNANT NEOPLASMS 1004
## 18 2008 ASIAN & PACIFIC ISLANDER DISEASES OF HEART 1002
## 19 2010 ASIAN & PACIFIC ISLANDER MALIGNANT NEOPLASMS 943
## 20 2007 ASIAN & PACIFIC ISLANDER DISEASES OF HEART 925
# Percentage change in deaths per cause between 2010 and 2011.
# After totalling Count per (Year, Cause.of.Death) the years are spread into
# one column each, and `years` holds the relative change in percent.
cause <- data_nyc_deaths %>%
  distinct() %>%
  filter(Year %in% c(2010, 2011)) %>%
  group_by(Year, Cause.of.Death) %>%
  tally(wt = Count) %>%
  spread(key = Year, value = n) %>%
  mutate(years = ((`2011` - `2010`) / `2010`) * 100)

# Largest increases first / largest decreases first.
maj_death <- cause %>% arrange(desc(years))
min_death <- cause %>% arrange(years)

head(maj_death, n = 6L)
## # A tibble: 6 x 4
## Cause.of.Death `2010` `2011` years
## <chr> <int> <int> <dbl>
## 1 TUBERCULOSIS 5 10 100
## 2 ANEMIAS 44 67 52.3
## 3 PEPTIC ULCER 78 94 20.5
## 4 PSYCH. SUBSTANCE USE & ACCIDENTAL DRUG POISONING 648 748 15.4
## 5 MENTAL DISORDERS DUE TO USE OF ALCOHOL 178 200 12.4
## 6 CEREBROVASCULAR DISEASE 1568 1740 11.0
# First six rows of min_death (sorted by ascending `years`).
head(min_death, n = 6L)
## # A tibble: 6 x 4
## Cause.of.Death `2010` `2011` years
## <chr> <int> <int> <dbl>
## 1 CHOLELITHIASIS AND DISORDERS OF GALLBLADDER 35 21 -40
## 2 SHORT GESTATION/LBW 139 92 -33.8
## 3 ATHEROSCLEROSIS 197 143 -27.4
## 4 CARDIOVASCULAR DISORDERS IN PERINATAL PERIOD 29 24 -17.2
## 5 PREGNANCY, CHILDBIRTH AND THE PUERPERIUM 29 25 -13.8
## 6 HUMAN IMMUNODEFICIENCY VIRUS DISEASE 817 747 -8.57
# 2. Dataset (Shark Attack)
# URL of the shark-attack CSV hosted on GitHub.
shark_data <- "https://raw.githubusercontent.com/jgarcia71/Data-607-Assignments/master/Shark%20attacks.csv"
# Download and parse the file, keeping text columns as character vectors.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas FALSE is a reserved word.
shark.data <- read.csv(shark_data, stringsAsFactors = FALSE)
# Show column names, types and a few leading values.
str(shark.data)
## 'data.frame': 25614 obs. of 22 variables:
## $ Case.Number : chr "2017.06.11" "2017.06.10.b" "2017.06.10.a" "2017.06.07.R" ...
## $ Date : chr "11-Jun-17" "10-Jun-17" "10-Jun-17" "Reported 07-Jun-2017" ...
## $ Year : int 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 ...
## $ Type : chr "Unprovoked" "Unprovoked" "Unprovoked" "Unprovoked" ...
## $ Country : chr "AUSTRALIA" "AUSTRALIA" "USA" "UNITED KINGDOM" ...
## $ Area : chr "Western Australia" "Victoria" "Florida" "South Devon" ...
## $ Location : chr "Point Casuarina, Bunbury" "Flinders, Mornington Penisula" "Ponce Inlet, Volusia County" "Bantham Beach" ...
## $ Activity : chr "Body boarding" "Surfing" "Surfing" "Surfing " ...
## $ Name : chr "Paul Goff" "female" "Bryan Brock" "Rich Thomson" ...
## $ Sex : chr "M" "F" "M" "M" ...
## $ Age : chr "48" "" "19" "30" ...
## $ Injury : chr "No injury, board bitten" "No injury, knocke off board" "Laceration to left foot" "Bruise to leg, cuts to hand sustained when he hit the shark" ...
## $ Fatal..Y.N. : chr "N" "N" "N" "N" ...
## $ Time : chr "08h30" "15h45" "10h00" "" ...
## $ Species : chr "White shark, 4 m" "7 gill shark" "" "3m shark, probably a smooth hound" ...
## $ Investigator.or.Source: chr "WA Today, 6/11/2017" "" "Daytona Beach News-Journal, 6/10/2017" "C. Moore, GSAF" ...
## $ pdf : chr "2017.06.11-Goff.pdf" "2017.06.10.b-Flinders.pdf" "2017.06.10.a-Brock.pdf" "2017.06.07.R-Thomson.pdf" ...
## $ href.formula : chr "http://sharkattackfile.net/spreadsheets/pdf_directory/2017.06.11-Goff.pdf" "http://sharkattackfile.net/spreadsheets/pdf_directory/2017.06.10.b-Flinders.pdf" "http://sharkattackfile.net/spreadsheets/pdf_directory/2017.06.10.a-Brock.pdf" "http://sharkattackfile.net/spreadsheets/pdf_directory/2017.06.07.R-Thomson.pdf" ...
## $ href : chr "http://sharkattackfile.net/spreadsheets/pdf_directory/http://sharkattackfile.net/spreadsheets/pdf_directory/201"| __truncated__ "http://sharkattackfile.net/spreadsheets/pdf_directory/2017.06.10.b-Flinders.pdf" "http://sharkattackfile.net/spreadsheets/pdf_directory/2017.06.10.a-Brock.pdf" "http://sharkattackfile.net/spreadsheets/pdf_directory/2017.06.07.R-Thomson.pdf" ...
## $ Case.Number.1 : chr "2017.06.11" "2017.06.10.b" "2017.06.10.a" "2017.06.07.R" ...
## $ Case.Number.2 : chr "2017.06.11" "2017.06.10.b" "2017.06.10.a" "2017.06.07.R" ...
## $ original.order : int 6095 6094 6093 6092 6091 6090 6089 6088 6087 6086 ...
# Inspect the last six rows of the raw table.
tail(shark.data, n = 6L)
## Case.Number Date Year Type Country Area Location Activity Name Sex
## 25609 NA
## 25610 NA
## 25611 NA
## 25612 NA
## 25613 NA
## 25614 xx NA
## Age Injury Fatal..Y.N. Time Species Investigator.or.Source pdf
## 25609
## 25610
## 25611
## 25612
## 25613
## 25614
## href.formula href Case.Number.1 Case.Number.2 original.order
## 25609 NA
## 25610 NA
## 25611 NA
## 25612 NA
## 25613 NA
## 25614 NA
# Discard rows whose case number is blank, missing, 0 or "xx", and remove the
# two extra case-number columns in a single select().
shark.data <- shark.data %>%
  filter(!(Case.Number %in% c("", NA, 0, "xx"))) %>%
  select(-Case.Number.1, -Case.Number.2)

# Count records per year for 1900 through 2016 (2017 is excluded).
a <- shark.data %>%
  filter(Year >= 1900, Year < 2017) %>%
  group_by(Year) %>%
  summarise(Attacks = n())

# Line chart of the yearly counts.
plot(a, type = "l")
# 3. Dataset (World Population)
# Load the World Population CSV from GitHub. skip = 4 drops the first four
# lines of the file before the header row is read; text columns stay character.
World_Population <- read.csv(
  "https://raw.githubusercontent.com/jgarcia71/Data-607-Assignments/master/World%20Population%20Data.csv",
  header = TRUE,
  stringsAsFactors = FALSE,
  skip = 4
)
# View() opens a data viewer, which only makes sense (and is only guaranteed
# to work) in an interactive session, so it is skipped when run as a script.
if (interactive()) {
  View(World_Population)
}

# Keep the first two (identifier) columns plus six yearly columns chosen by
# position; after reshaping they appear as X1964, X1974, ..., X2014.
WorldPopulation_selct <- select(World_Population, c(1, 2, 9, 19, 29, 39, 49, 59))
names(WorldPopulation_selct)[1:2] <- c("Country_Name", "Country_Code")

# Rows whose name mentions an income level, minus the "Middle East"/"OECD"
# entries and any name containing "&".
# NOTE(review): "Sub-Saharan Africa (excluding high income)" also passes these
# patterns - confirm whether that regional aggregate belongs in this table.
WorldPopulation_selct_income <- WorldPopulation_selct %>%
  filter(
    grepl("income", Country_Name),
    grepl("Low|High|Middle|low|high|middle", Country_Name),
    !grepl("Middle East|OECD", Country_Name),
    !grepl("&", Country_Name)
  )

# Wide -> long: one row per (aggregate, year) with the population as value.
WorldPopulation_selct_tidy <- WorldPopulation_selct_income %>%
  gather(key = "Year", value = "Population", 3:8)

# Print the tidy table sorted for reading; the sorted copy is not stored.
WorldPopulation_selct_tidy %>%
  arrange(Country_Name, Year, Population)
## Country_Name Country_Code Year
## 1 High income HIC X1964
## 2 High income HIC X1974
## 3 High income HIC X1984
## 4 High income HIC X1994
## 5 High income HIC X2004
## 6 High income HIC X2014
## 7 Low income LIC X1964
## 8 Low income LIC X1974
## 9 Low income LIC X1984
## 10 Low income LIC X1994
## 11 Low income LIC X2004
## 12 Low income LIC X2014
## 13 Lower middle income LMC X1964
## 14 Lower middle income LMC X1974
## 15 Lower middle income LMC X1984
## 16 Lower middle income LMC X1994
## 17 Lower middle income LMC X2004
## 18 Lower middle income LMC X2014
## 19 Middle income MIC X1964
## 20 Middle income MIC X1974
## 21 Middle income MIC X1984
## 22 Middle income MIC X1994
## 23 Middle income MIC X2004
## 24 Middle income MIC X2014
## 25 Sub-Saharan Africa (excluding high income) SSA X1964
## 26 Sub-Saharan Africa (excluding high income) SSA X1974
## 27 Sub-Saharan Africa (excluding high income) SSA X1984
## 28 Sub-Saharan Africa (excluding high income) SSA X1994
## 29 Sub-Saharan Africa (excluding high income) SSA X2004
## 30 Sub-Saharan Africa (excluding high income) SSA X2014
## 31 Upper middle income UMC X1964
## 32 Upper middle income UMC X1974
## 33 Upper middle income UMC X1984
## 34 Upper middle income UMC X1994
## 35 Upper middle income UMC X2004
## 36 Upper middle income UMC X2014
## Population
## 1 823154587
## 2 913510969
## 3 992295742
## 4 1069147445
## 5 1145971873
## 6 1227211897
## 7 182077594
## 8 233887209
## 9 300220347
## 10 395678396
## 11 519630935
## 12 678572076
## 13 1024253768
## 14 1292662974
## 15 1640549014
## 16 2042431452
## 17 2439908661
## 18 2847559084
## 19 2252227568
## 20 2845473103
## 21 3470527013
## 22 4163965335
## 23 4774222573
## 24 5365538848
## 25 251530081
## 26 325914096
## 27 432508714
## 28 571754398
## 29 745705643
## 30 978926559
## 31 1227973800
## 32 1552810129
## 33 1829977999
## 34 2121533883
## 35 2334313912
## 36 2517979764
# Population by income group: one blue line per year drawn across the groups,
# with yellow-filled square markers; the colour aesthetic is mapped to Year.
ggplot(
  WorldPopulation_selct_tidy,
  aes(x = Country_Name, y = Population, colour = Year, group = Year)
) +
  geom_line(linetype = "solid", color = "blue", size = 1) +
  geom_point(shape = 22, size = 3, fill = "yellow") +
  labs(title = "World Population", x = "Income", y = "Population")
# Remove income rows and the aggregate rows matched by the name/code patterns
# below, then order the remainder by the X2014 column, largest first.
# NOTE(review): the exclusion patterns are hand-picked; verify that no
# aggregate row remains among the first ten.
WorldPopulation_selct_country <- WorldPopulation_selct %>%
  filter(
    !grepl("income", Country_Name),
    !grepl("develop|debt|OECD|Fragile|classi|Other|World|Euro|Baltic", Country_Name),
    !grepl("SAS|NAC", Country_Code)
  ) %>%
  arrange(desc(X2014))

# Ten largest entries by X2014.
WorldPopulation_selct_count10 <- head(WorldPopulation_selct_country, 10)

# Wide -> long, sorted by name, then year, then population.
WorldPopulation_selct_count10_td <- WorldPopulation_selct_count10 %>%
  gather(key = "Year", value = "Population", 3:8) %>%
  arrange(Country_Name, Year, Population)
# Horizontal dodged bars: one bar per year for each of the ten entries, light
# blue fill with the colour aesthetic mapped to Year.
ggplot(
  WorldPopulation_selct_count10_td,
  aes(x = Country_Name, y = Population, colour = Year, group = Year)
) +
  geom_col(position = "dodge", fill = "lightblue") +
  coord_flip() +
  labs(title = "World Population Country", x = "Country", y = "Population")