library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

1.- Dataset (New York City Leading Causes of Death)

# Read the NYC leading-causes-of-death CSV from the working directory,
# keeping text columns as character vectors, then preview the first rows.
data_nyc_deaths <- read.csv(
  "New_York_City_Leading_Causes_of_Death.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
head(data_nyc_deaths, n = 6L)
##   Year          Ethnicity  Sex                              Cause.of.Death
## 1 2010 NON-HISPANIC BLACK MALE        HUMAN IMMUNODEFICIENCY VIRUS DISEASE
## 2 2010 NON-HISPANIC BLACK MALE                     INFLUENZA AND PNEUMONIA
## 3 2010 NON-HISPANIC BLACK MALE             INTENTIONAL SELF-HARM (SUICIDE)
## 4 2010 NON-HISPANIC BLACK MALE                         MALIGNANT NEOPLASMS
## 5 2010 NON-HISPANIC BLACK MALE      MENTAL DISORDERS DUE TO USE OF ALCOHOL
## 6 2010 NON-HISPANIC BLACK MALE NEPHRITIS, NEPHROTIC SYNDROME AND NEPHROSIS
##   Count Percent
## 1   297       5
## 2   201       3
## 3    64       1
## 4  1540      23
## 5    50       1
## 6    70       1
# Leading cause of death for every Year/Sex combination.
# Duplicate rows are removed first; tally() then sums Count per
# Year/Sex/Cause and drops the last grouping level, so top_n() keeps the
# cause with the largest total inside each remaining Year/Sex group.
gender <- data_nyc_deaths %>%
  distinct() %>%
  group_by(Year, Sex, Cause.of.Death) %>%
  tally(wt = Count, sort = TRUE) %>%
  top_n(n = 1, wt = n) %>%
  arrange(Year)
print(gender)
## # A tibble: 10 x 4
## # Groups:   Year, Sex [10]
##     Year Sex    Cause.of.Death        n
##    <int> <chr>  <chr>             <int>
##  1  2007 FEMALE DISEASES OF HEART 11618
##  2  2007 MALE   DISEASES OF HEART  9577
##  3  2008 FEMALE DISEASES OF HEART 11462
##  4  2008 MALE   DISEASES OF HEART  9456
##  5  2009 FEMALE DISEASES OF HEART 10630
##  6  2009 MALE   DISEASES OF HEART  9173
##  7  2010 FEMALE DISEASES OF HEART  9343
##  8  2010 MALE   DISEASES OF HEART  8344
##  9  2011 FEMALE DISEASES OF HEART  9009
## 10  2011 MALE   DISEASES OF HEART  7713
# Leading cause of death for every Year/Ethnicity combination.
# Same recipe as for sex: de-duplicate, sum Count per
# Year/Ethnicity/Cause, then keep the largest total in each Year/Ethnicity
# group. The rows stay in the descending-total order produced by tally().
ethnic <- data_nyc_deaths %>%
  distinct() %>%
  group_by(Year, Ethnicity, Cause.of.Death) %>%
  tally(wt = Count, sort = TRUE) %>%
  top_n(n = 1, wt = n)
print(ethnic)
## # A tibble: 20 x 4
## # Groups:   Year, Ethnicity [20]
##     Year Ethnicity                Cause.of.Death          n
##    <int> <chr>                    <chr>               <int>
##  1  2007 NON-HISPANIC WHITE       DISEASES OF HEART   12682
##  2  2008 NON-HISPANIC WHITE       DISEASES OF HEART   12339
##  3  2009 NON-HISPANIC WHITE       DISEASES OF HEART   11465
##  4  2010 NON-HISPANIC WHITE       DISEASES OF HEART    9846
##  5  2011 NON-HISPANIC WHITE       DISEASES OF HEART    9236
##  6  2007 NON-HISPANIC BLACK       DISEASES OF HEART    4843
##  7  2008 NON-HISPANIC BLACK       DISEASES OF HEART    4802
##  8  2009 NON-HISPANIC BLACK       DISEASES OF HEART    4603
##  9  2010 NON-HISPANIC BLACK       DISEASES OF HEART    4297
## 10  2011 NON-HISPANIC BLACK       DISEASES OF HEART    4083
## 11  2008 HISPANIC                 DISEASES OF HEART    2775
## 12  2007 HISPANIC                 DISEASES OF HEART    2745
## 13  2009 HISPANIC                 DISEASES OF HEART    2731
## 14  2010 HISPANIC                 DISEASES OF HEART    2671
## 15  2011 HISPANIC                 DISEASES OF HEART    2549
## 16  2009 ASIAN & PACIFIC ISLANDER DISEASES OF HEART    1004
## 17  2011 ASIAN & PACIFIC ISLANDER MALIGNANT NEOPLASMS  1004
## 18  2008 ASIAN & PACIFIC ISLANDER DISEASES OF HEART    1002
## 19  2010 ASIAN & PACIFIC ISLANDER MALIGNANT NEOPLASMS   943
## 20  2007 ASIAN & PACIFIC ISLANDER DISEASES OF HEART     925
# Percentage change in deaths per cause between 2010 and 2011.
# Totals are computed per Year and Cause, spread into one column per year,
# and the relative change (in percent) is stored in the `years` column.
cause <- data_nyc_deaths %>%
  distinct() %>%
  filter(Year %in% c(2010, 2011)) %>%
  group_by(Year, Cause.of.Death) %>%
  tally(wt = Count) %>%
  spread(key = Year, value = n) %>%
  mutate(years = (`2011` - `2010`) / `2010` * 100)

# Largest increases first / largest decreases first.
maj_death <- cause %>% arrange(desc(years))
min_death <- cause %>% arrange(years)
head(maj_death, n = 6L)
## # A tibble: 6 x 4
##   Cause.of.Death                                   `2010` `2011` years
##   <chr>                                             <int>  <int> <dbl>
## 1 TUBERCULOSIS                                          5     10 100  
## 2 ANEMIAS                                              44     67  52.3
## 3 PEPTIC ULCER                                         78     94  20.5
## 4 PSYCH. SUBSTANCE USE & ACCIDENTAL DRUG POISONING    648    748  15.4
## 5 MENTAL DISORDERS DUE TO USE OF ALCOHOL              178    200  12.4
## 6 CEREBROVASCULAR DISEASE                            1568   1740  11.0
# Preview the causes with the largest percentage decrease.
head(min_death, n = 6L)
## # A tibble: 6 x 4
##   Cause.of.Death                               `2010` `2011`  years
##   <chr>                                         <int>  <int>  <dbl>
## 1 CHOLELITHIASIS AND DISORDERS OF GALLBLADDER      35     21 -40   
## 2 SHORT GESTATION/LBW                             139     92 -33.8 
## 3 ATHEROSCLEROSIS                                 197    143 -27.4 
## 4 CARDIOVASCULAR DISORDERS IN PERINATAL PERIOD     29     24 -17.2 
## 5 PREGNANCY, CHILDBIRTH AND THE PUERPERIUM         29     25 -13.8 
## 6 HUMAN IMMUNODEFICIENCY VIRUS DISEASE            817    747  -8.57

2.- Dataset (Shark Attack)

# Location of the shark-attack CSV. Note the two similar names:
# `shark_data` holds the URL, `shark.data` holds the parsed data frame.
shark_data <- "https://raw.githubusercontent.com/jgarcia71/Data-607-Assignments/master/Shark%20attacks.csv"

# Keep text columns as character. FALSE is spelled out because the shorthand
# `F` is an ordinary variable that can be reassigned, whereas FALSE is a
# reserved word.
shark.data <- read.csv(shark_data, stringsAsFactors = FALSE)

# Show the column names and types of the raw import.
str(shark.data)
## 'data.frame':    25614 obs. of  22 variables:
##  $ Case.Number           : chr  "2017.06.11" "2017.06.10.b" "2017.06.10.a" "2017.06.07.R" ...
##  $ Date                  : chr  "11-Jun-17" "10-Jun-17" "10-Jun-17" "Reported 07-Jun-2017" ...
##  $ Year                  : int  2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 ...
##  $ Type                  : chr  "Unprovoked" "Unprovoked" "Unprovoked" "Unprovoked" ...
##  $ Country               : chr  "AUSTRALIA" "AUSTRALIA" "USA" "UNITED KINGDOM" ...
##  $ Area                  : chr  "Western Australia" "Victoria" "Florida" "South Devon" ...
##  $ Location              : chr  "Point Casuarina, Bunbury" "Flinders, Mornington Penisula" "Ponce Inlet, Volusia County" "Bantham Beach" ...
##  $ Activity              : chr  "Body boarding" "Surfing" "Surfing" "Surfing " ...
##  $ Name                  : chr  "Paul Goff" "female" "Bryan Brock" "Rich Thomson" ...
##  $ Sex                   : chr  "M" "F" "M" "M" ...
##  $ Age                   : chr  "48" "" "19" "30" ...
##  $ Injury                : chr  "No injury, board bitten" "No injury, knocke off board" "Laceration to left foot" "Bruise to leg, cuts to hand sustained when he hit the shark" ...
##  $ Fatal..Y.N.           : chr  "N" "N" "N" "N" ...
##  $ Time                  : chr  "08h30" "15h45" "10h00" "" ...
##  $ Species               : chr  "White shark, 4 m" "7 gill shark" "" "3m shark, probably a smooth hound" ...
##  $ Investigator.or.Source: chr  "WA Today, 6/11/2017" "" "Daytona Beach News-Journal, 6/10/2017" "C. Moore, GSAF" ...
##  $ pdf                   : chr  "2017.06.11-Goff.pdf" "2017.06.10.b-Flinders.pdf" "2017.06.10.a-Brock.pdf" "2017.06.07.R-Thomson.pdf" ...
##  $ href.formula          : chr  "http://sharkattackfile.net/spreadsheets/pdf_directory/2017.06.11-Goff.pdf" "http://sharkattackfile.net/spreadsheets/pdf_directory/2017.06.10.b-Flinders.pdf" "http://sharkattackfile.net/spreadsheets/pdf_directory/2017.06.10.a-Brock.pdf" "http://sharkattackfile.net/spreadsheets/pdf_directory/2017.06.07.R-Thomson.pdf" ...
##  $ href                  : chr  "http://sharkattackfile.net/spreadsheets/pdf_directory/http://sharkattackfile.net/spreadsheets/pdf_directory/201"| __truncated__ "http://sharkattackfile.net/spreadsheets/pdf_directory/2017.06.10.b-Flinders.pdf" "http://sharkattackfile.net/spreadsheets/pdf_directory/2017.06.10.a-Brock.pdf" "http://sharkattackfile.net/spreadsheets/pdf_directory/2017.06.07.R-Thomson.pdf" ...
##  $ Case.Number.1         : chr  "2017.06.11" "2017.06.10.b" "2017.06.10.a" "2017.06.07.R" ...
##  $ Case.Number.2         : chr  "2017.06.11" "2017.06.10.b" "2017.06.10.a" "2017.06.07.R" ...
##  $ original.order        : int  6095 6094 6093 6092 6091 6090 6089 6088 6087 6086 ...
# Inspect the last rows of the raw import.
tail(shark.data, n = 6L)
##       Case.Number Date Year Type Country Area Location Activity Name Sex
## 25609                    NA                                             
## 25610                    NA                                             
## 25611                    NA                                             
## 25612                    NA                                             
## 25613                    NA                                             
## 25614          xx        NA                                             
##       Age Injury Fatal..Y.N. Time Species Investigator.or.Source pdf
## 25609                                                               
## 25610                                                               
## 25611                                                               
## 25612                                                               
## 25613                                                               
## 25614                                                               
##       href.formula href Case.Number.1 Case.Number.2 original.order
## 25609                                                           NA
## 25610                                                           NA
## 25611                                                           NA
## 25612                                                           NA
## 25613                                                           NA
## 25614                                                           NA
# Drop rows without a usable case number and remove the two redundant
# case-number columns. The mixed vector c("", NA, 0, "xx") is coerced to
# character by c(), so 0 is compared as the string "0".
shark.data <- shark.data %>%
  filter(!(Case.Number %in% c("", NA, 0, "xx"))) %>%
  select(-Case.Number.1, -Case.Number.2)

# Number of recorded attacks per year for 1900 through 2016, drawn as a
# line chart.
a <- shark.data %>%
  filter(Year >= 1900, Year < 2017) %>%
  group_by(Year) %>%
  summarise(Attacks = n())
plot(a, type = "l")

3.- Dataset (World Population)

# Download the World Population table. skip = 4 discards the first four
# lines of the file before the header row is read. TRUE/FALSE are spelled
# out because the shorthands T/F are reassignable variables.
World_Population <- read.csv(
  "https://raw.githubusercontent.com/jgarcia71/Data-607-Assignments/master/World%20Population%20Data.csv",
  header = TRUE,
  stringsAsFactors = FALSE,
  skip = 4
)

# The spreadsheet viewer is only useful in an interactive session; guard it
# so the script can also be sourced or knitted non-interactively.
if (interactive()) {
  View(World_Population)
}

# Keep the two identifier columns plus six yearly columns chosen by position
# (they appear as X1964, X1974, ..., X2014 in the printed result below).
WorldPopulation_selct <- select(World_Population, c(1, 2, 9, 19, 29, 39, 49, 59))

# Give the identifier columns stable names.
names(WorldPopulation_selct)[1:2] <- c("Country_Name", "Country_Code")

# Keep the rows whose name mentions an income level, then drop names that
# mention "Middle East" or "OECD" and names containing "&".
# NOTE(review): the printed result still contains the regional aggregate
# "Sub-Saharan Africa (excluding high income)" alongside the income groups;
# confirm whether that row is wanted in the income comparison.
WorldPopulation_selct_income <- WorldPopulation_selct %>%
  filter(grepl('income', Country_Name)) %>%
  filter(grepl('Low|High|Middle|low|high|middle', Country_Name)) %>%
  filter(!grepl('Middle East|OECD', Country_Name)) %>%
  filter(!grepl('&', Country_Name))

# Reshape to long form: one row per (group, year) with its population.
WorldPopulation_selct_tidy <- WorldPopulation_selct_income %>%
  gather("Year", "Population", 3:8)

# Display the tidy table sorted by group and year (the result is not stored).
WorldPopulation_selct_tidy %>%
  arrange(Country_Name, Year, Population)
##                                  Country_Name Country_Code  Year
## 1                                 High income          HIC X1964
## 2                                 High income          HIC X1974
## 3                                 High income          HIC X1984
## 4                                 High income          HIC X1994
## 5                                 High income          HIC X2004
## 6                                 High income          HIC X2014
## 7                                  Low income          LIC X1964
## 8                                  Low income          LIC X1974
## 9                                  Low income          LIC X1984
## 10                                 Low income          LIC X1994
## 11                                 Low income          LIC X2004
## 12                                 Low income          LIC X2014
## 13                        Lower middle income          LMC X1964
## 14                        Lower middle income          LMC X1974
## 15                        Lower middle income          LMC X1984
## 16                        Lower middle income          LMC X1994
## 17                        Lower middle income          LMC X2004
## 18                        Lower middle income          LMC X2014
## 19                              Middle income          MIC X1964
## 20                              Middle income          MIC X1974
## 21                              Middle income          MIC X1984
## 22                              Middle income          MIC X1994
## 23                              Middle income          MIC X2004
## 24                              Middle income          MIC X2014
## 25 Sub-Saharan Africa (excluding high income)          SSA X1964
## 26 Sub-Saharan Africa (excluding high income)          SSA X1974
## 27 Sub-Saharan Africa (excluding high income)          SSA X1984
## 28 Sub-Saharan Africa (excluding high income)          SSA X1994
## 29 Sub-Saharan Africa (excluding high income)          SSA X2004
## 30 Sub-Saharan Africa (excluding high income)          SSA X2014
## 31                        Upper middle income          UMC X1964
## 32                        Upper middle income          UMC X1974
## 33                        Upper middle income          UMC X1984
## 34                        Upper middle income          UMC X1994
## 35                        Upper middle income          UMC X2004
## 36                        Upper middle income          UMC X2014
##    Population
## 1   823154587
## 2   913510969
## 3   992295742
## 4  1069147445
## 5  1145971873
## 6  1227211897
## 7   182077594
## 8   233887209
## 9   300220347
## 10  395678396
## 11  519630935
## 12  678572076
## 13 1024253768
## 14 1292662974
## 15 1640549014
## 16 2042431452
## 17 2439908661
## 18 2847559084
## 19 2252227568
## 20 2845473103
## 21 3470527013
## 22 4163965335
## 23 4774222573
## 24 5365538848
## 25  251530081
## 26  325914096
## 27  432508714
## 28  571754398
## 29  745705643
## 30  978926559
## 31 1227973800
## 32 1552810129
## 33 1829977999
## 34 2121533883
## 35 2334313912
## 36 2517979764
# Population per income group, one line per year.
# The line layer sets a fixed colour ("blue"), so the colour = Year mapping
# is not applied to the lines themselves.
ggplot(
  WorldPopulation_selct_tidy,
  aes(x = Country_Name, y = Population, colour = Year, group = Year)
) +
  geom_line(linetype = "solid", color = "blue", size = 1) +
  geom_point(shape = 22, size = 3, fill = "yellow") +
  ggtitle("World Population") +
  labs(x = "Income", y = "Population")

# Remove income groups and other aggregate rows (matched by name fragments
# and by the SAS / NAC codes), then order by 2014 population, largest first.
# NOTE(review): the exclusion list is pattern-based; verify that no regional
# aggregate remains among the top rows.
WorldPopulation_selct_country <- WorldPopulation_selct %>%
  filter(!grepl('income', Country_Name)) %>%
  filter(!grepl('develop|debt|OECD|Fragile|classi|Other|World|Euro|Baltic', Country_Name)) %>%
  filter(!grepl('SAS|NAC', Country_Code)) %>%
  arrange(desc(X2014))

# Ten most populous entries in 2014.
WorldPopulation_selct_count10 <- head(WorldPopulation_selct_country, n = 10)

# Long form (one row per entry and year), sorted for display.
WorldPopulation_selct_count10_td <- WorldPopulation_selct_count10 %>%
  gather("Year", "Population", 3:8) %>%
  arrange(Country_Name, Year, Population)

# Horizontal dodged bars: one bar per year for each entry.
ggplot(
  WorldPopulation_selct_count10_td,
  aes(x = Country_Name, y = Population, colour = Year, group = Year)
) +
  coord_flip() +
  geom_bar(stat = "identity", position = "dodge", fill = "lightblue") +
  ggtitle("World Population Country") +
  labs(x = "Country", y = "Population")