IS607 Project 2

Load Required Libraries

##Load libraries
library(dplyr) #all data
library(tidyr) #all data
library(readxl)#Import data 2
library(ggplot2)#all data
library(plotrix) #data1
library(plotly) #data 2 https://plot.ly/ggplot2/animations/
library(gapminder) #data2 https://plot.ly/ggplot2/animations/

Data1 posted by Michael Silva (Adult Arrests)

https://data.ny.gov/Public-Safety/Adult-Arrests-by-County-Beginning-1970/rikd-mt35/data.

Get Data 1

# Read the adult-arrests CSV straight from the data.ny.gov download URL and
# print a compact overview of its columns.
theurl <- "https://data.ny.gov/api/views/rikd-mt35/rows.csv?accessType=DOWNLOAD"
thedata <- read.table(theurl, sep = ",", header = TRUE)
thedata %>% glimpse()

## Observations: 3,055
## Variables: 13
## $ County            <fct> Albany, Albany, Albany, Albany, Albany, Alba...
## $ Year              <int> 1970, 1971, 1972, 1973, 1974, 1975, 1976, 19...
## $ Total             <int> 1226, 1833, 3035, 3573, 4255, 4173, 4601, 48...
## $ Felony.Total      <int> 688, 829, 1054, 1134, 1329, 1259, 1435, 1342...
## $ Drug.Felony       <int> 97, 131, 211, 244, 281, 209, 201, 122, 85, 1...
## $ Violent.Felony    <int> 191, 231, 256, 274, 308, 344, 434, 403, 433,...
## $ DWI.Felony        <int> 5, 6, 8, 28, 17, 12, 26, 45, 58, 65, 79, 81,...
## $ Other.Felony      <int> 395, 461, 579, 588, 723, 694, 774, 772, 909,...
## $ Misdemeanor.Total <int> 538, 1004, 1981, 2439, 2926, 2914, 3166, 347...
## $ Drug.Misd         <int> 207, 204, 285, 369, 437, 398, 362, 270, 157,...
## $ DWI.Misd          <int> 48, 111, 297, 497, 619, 463, 574, 858, 1540,...
## $ Property.Misd     <int> 95, 272, 541, 668, 885, 977, 1011, 1133, 133...
## $ Other.Misd        <int> 188, 417, 858, 905, 985, 1076, 1219, 1216, 1...

Tidy Data Offense

Gather all offense totals and display GGPlots

# Preview the first six rows of the raw arrests table before reshaping.
head(thedata, n = 6L)

##   County Year Total Felony.Total Drug.Felony Violent.Felony DWI.Felony
## 1 Albany 1970  1226          688          97            191          5
## 2 Albany 1971  1833          829         131            231          6
## 3 Albany 1972  3035         1054         211            256          8
## 4 Albany 1973  3573         1134         244            274         28
## 5 Albany 1974  4255         1329         281            308         17
## 6 Albany 1975  4173         1259         209            344         12
##   Other.Felony Misdemeanor.Total Drug.Misd DWI.Misd Property.Misd
## 1          395               538       207       48            95
## 2          461              1004       204      111           272
## 3          579              1981       285      297           541
## 4          588              2439       369      497           668
## 5          723              2926       437      619           885
## 6          694              2914       398      463           977
##   Other.Misd
## 1        188
## 2        417
## 3        858
## 4        905
## 5        985
## 6       1076

# Total arrests per offense category across all counties and years.
# Year and the aggregate columns (Total, Felony.Total, Misdemeanor.Total) are
# dropped first, so only County plus the individual offense columns remain
# and every non-County column is gathered into long form.
tddata1.1 <- thedata %>%
  select(-Felony.Total, -Misdemeanor.Total, -Year, -Total) %>%
  gather(offense, value, -County) %>%
  group_by(offense) %>%
  # plain summarise() replaces the deprecated summarise_each(funs(sum), ...)
  summarise(value = sum(value)) %>%
  # share of all arrests; the denominator is the raw Total column
  mutate(pctallcrime = round(value / sum(thedata$Total) * 100, 5)) %>%
  arrange(desc(value))
tddata1.1

## # A tibble: 8 x 3
##   offense          value pctallcrime
##   <chr>            <int>       <dbl>
## 1 Other.Misd     4946081      22.4  
## 2 Property.Misd  4659585      21.1  
## 3 Other.Felony   3358148      15.2  
## 4 Drug.Misd      2862559      13.0  
## 5 Violent.Felony 2471367      11.2  
## 6 DWI.Misd       1925200       8.72 
## 7 Drug.Felony    1648439       7.47 
## 8 DWI.Felony      198436       0.899

GGPlot

# Three views of the per-offense totals: a base-R pie chart, a bubble plot
# and a bar chart.
options(scipen = 20)  # bias number formatting towards fixed notation
offense_labels <- paste0(
  tddata1.1$offense, " ", round(tddata1.1$pctallcrime, 0), "%"
)
pie(
  tddata1.1$value,
  labels = offense_labels,
  col = rainbow(length(tddata1.1$value)),
  main = "Pie Chart of Offenses"
)

ggplot(tddata1.1) +
  geom_point(aes(x = pctallcrime, y = value, size = pctallcrime, color = offense))

# The x-axis tick labels are blanked; the fill legend identifies each bar.
# (Alternative kept for reference: rotate them with element_text(angle = -90).)
ggplot(tddata1.1) +
  geom_col(aes(x = offense, y = value, fill = offense), position = "identity") +
  theme(axis.text.x = element_blank())

Tidy Data by Top 10 Counties

Gather all offense totals and display GGplots by Counties

# Total arrests per county (all offenses, all years), keeping the ten
# counties with the largest totals.
tddata1.2 <- thedata %>%
  select(-Felony.Total, -Misdemeanor.Total, -Year, -Total) %>%
  gather(offense, value, -County) %>%
  group_by(County) %>%
  # plain summarise() replaces the deprecated summarise_each(funs(sum), ...)
  summarise(value = sum(value)) %>%
  # share of all arrests; the denominator is the raw Total column
  mutate(pctallcrime = round(value / sum(thedata$Total) * 100, 5)) %>%
  arrange(desc(value)) %>%
  top_n(10, value)
tddata1.2

## # A tibble: 10 x 3
##    County        value pctallcrime
##    <fct>         <int>       <dbl>
##  1 New York    4112305       18.6 
##  2 Kings       3289267       14.9 
##  3 Bronx       2596596       11.8 
##  4 Queens      1881895        8.53
##  5 Erie        1100599        4.99
##  6 Suffolk     1030156        4.67
##  7 Nassau       816633        3.70
##  8 Monroe       766274        3.47
##  9 Westchester  704951        3.19
## 10 Onondaga     492757        2.23

GGPlot

# Pie chart and bubble plot for the top-10 counties.
# Both the county name and its percentage are read from tddata1.2 so each
# label describes its own slice (tddata1.1 holds per-offense shares and has a
# different number of rows).
lbls <- paste0(tddata1.2$County, " ", round(tddata1.2$pctallcrime, 0), "%")
pie(
  tddata1.2$value,
  labels = lbls,
  col = rainbow(length(tddata1.2$value)),
  main = "Pie Chart by Top 10 County"
)

ggplot(data = tddata1.2) +
  geom_point(mapping = aes(x = pctallcrime, y = value, size = pctallcrime, color = County))

# Arrest totals for every County/offense pair, drawn as one bar per county
# split by offense.
tddata1.3 <- thedata %>%
  select(-Felony.Total, -Misdemeanor.Total, -Year, -Total) %>%
  gather(offense, value, -County) %>%
  group_by(County, offense) %>%
  # plain summarise() replaces the deprecated summarise_each(funs(sum), ...)
  summarise(value = sum(value)) %>%
  # summarise() only peels off the last grouping level; drop the rest too
  ungroup() %>%
  mutate(pctallcrime = round(value / sum(thedata$Total) * 100, 5)) %>%
  arrange(desc(value))

# Stack the offense segments: with position = "identity" all eight bars of a
# county start at zero and overlap, hiding the smaller categories.
ggplot(data = tddata1.3) +
  geom_bar(
    mapping = aes(x = County, y = value, fill = offense),
    stat = "identity",
    position = "stack"
  ) +
  theme(axis.text.x = element_text(angle = -90, size = 9))

Data2 posted by Juanelle Marks (Population Migration)

http://www.un.org/en/development/desa/population/migration/data/estimates2/estimates17.shtml

Get Data 2

# Download the UN migration-flow workbook (binary mode) into the working
# directory, then read it while skipping the first 16 rows of the sheet.
theurl2 <- "http://www.un.org/en/development/desa/population/migration/data/empirical2/data/UN_MigFlow_Totals.xlsx"
destfile <- "UN_MigFlow_Totals.xlsx"
download.file(url = theurl2, destfile = destfile, mode = "wb")
thedata2 <- read_xlsx(destfile, skip = 16)
thedata2 %>% glimpse()

## Observations: 229
## Variables: 38
## $ CntName  <chr> "Armenia", "Armenia", "Australia", "Australia", "Aust...
## $ Criteria <chr> "Residence", "Residence", "Residence", "Residence", "...
## $ Type     <chr> "Emigrants", "Immigrants", "Emigrants", "Immigrants",...
## $ Coverage <chr> "Both", "Both", "Both", "Both", "Citizens", "Foreigne...
## $ `1980`   <chr> "..", "..", "90860", "184290", "..", "..", "..", ".."...
## $ `1981`   <chr> "..", "..", "85600", "212690", "..", "..", "..", ".."...
## $ `1982`   <chr> "..", "..", "92340", "195200", "..", "..", "..", ".."...
## $ `1983`   <chr> "..", "..", "100510", "153570", "..", "..", "..", ".....
## $ `1984`   <chr> "..", "..", "96360", "153530", "..", "..", "..", ".."...
## $ `1985`   <chr> "..", "..", "93440", "172550", "..", "..", "..", ".."...
## $ `1986`   <chr> "..", "..", "92450", "196690", "..", "..", "..", ".."...
## $ `1987`   <chr> "..", "..", "97770", "221620", "..", "..", "..", ".."...
## $ `1988`   <chr> "..", "..", "104770", "253860", "..", "..", "..", ".....
## $ `1989`   <chr> "..", "..", "120040", "238050", "..", "..", "..", ".....
## $ `1990`   <chr> "..", "..", "137470", "234050", "..", "..", "..", ".....
## $ `1991`   <chr> "..", "..", "143710", "237240", "..", "..", "..", ".....
## $ `1992`   <chr> "..", "..", "143660", "220460", "..", "..", "..", ".....
## $ `1993`   <chr> "..", "..", "140420", "197940", "..", "..", "..", ".....
## $ `1994`   <chr> "..", "..", "141680", "221920", "..", "..", "..", ".....
## $ `1995`   <chr> "..", "..", "149360", "253940", "..", "..", "..", ".....
## $ `1996`   <chr> "..", "..", "158260", "261330", "17136", "46725", "12...
## $ `1997`   <chr> "..", "..", "176560", "260220", "18830", "48264", "13...
## $ `1998`   <chr> "..", "..", "179600", "268390", "19407", "44865", "13...
## $ `1999`   <chr> "..", "..", "185670", "289870", "19644", "47279", "14...
## $ `2000`   <chr> "12030", "1767", "206120", "317560", "18224", "46248"...
## $ `2001`   <chr> "11901", "1764", "216130", "356410", "21644", "51010"...
## $ `2002`   <chr> "10433", "1715", "222940", "361990", "30353", "44478"...
## $ `2003`   <chr> "8482", "1926", "224890", "388450", "23056", "48940",...
## $ `2004`   <chr> "8451", "1514", "212200", "350990", "21703", "50018",...
## $ `2005`   <chr> "9303", "1497", "206690", "363470", "20333", "49800",...
## $ `2006`   <chr> "8053", "1335", "204800", "402210", "19387", "55045",...
## $ `2007`   <chr> "7461", "1112", "216580", "460650", "17828", "32070",...
## $ `2008`   <chr> "6121", "864", "220280", "535970", "18168", "33395", ...
## $ `2009`   <chr> "4100", "861", "..", "..", "16376", "36868", "8988", ...
## $ `2010`   <chr> "..", "..", "..", "..", "16059", "35592", "8817", "62...
## $ `2011`   <chr> "..", "..", "..", "..", "14401", "36796", "8082", "74...
## $ `2012`   <chr> "..", "..", "..", "..", "15443", "36369", "8272", "83...
## $ `2013`   <chr> "..", "..", "..", "..", "15368", "38703", "9237", "92...

Tidy Data by Year Totals

Gather all year totals and display dynamic year to year changes in plotly

# Preview the first six rows of the raw migration table before reshaping.
head(thedata2, n = 6L)

## # A tibble: 6 x 38
##   CntName  Criteria   Type    Coverage  `1980` `1981` `1982` `1983` `1984`
##   <chr>    <chr>      <chr>   <chr>     <chr>  <chr>  <chr>  <chr>  <chr> 
## 1 Armenia  Residence  Emigra~ Both      ..     ..     ..     ..     ..    
## 2 Armenia  Residence  Immigr~ Both      ..     ..     ..     ..     ..    
## 3 Austral~ Residence  Emigra~ Both      90860  85600  92340  100510 96360 
## 4 Austral~ Residence  Immigr~ Both      184290 212690 195200 153570 153530
## 5 Austria  Citizensh~ Emigra~ Citizens  ..     ..     ..     ..     ..    
## 6 Austria  Citizensh~ Emigra~ Foreigne~ ..     ..     ..     ..     ..    
## # ... with 29 more variables: `1985` <chr>, `1986` <chr>, `1987` <chr>,
## #   `1988` <chr>, `1989` <chr>, `1990` <chr>, `1991` <chr>, `1992` <chr>,
## #   `1993` <chr>, `1994` <chr>, `1995` <chr>, `1996` <chr>, `1997` <chr>,
## #   `1998` <chr>, `1999` <chr>, `2000` <chr>, `2001` <chr>, `2002` <chr>,
## #   `2003` <chr>, `2004` <chr>, `2005` <chr>, `2006` <chr>, `2007` <chr>,
## #   `2008` <chr>, `2009` <chr>, `2010` <chr>, `2011` <chr>, `2012` <chr>,
## #   `2013` <chr>

# Reshape the 1980-2013 year columns into long form, drop the ".." placeholder
# cells, convert the remaining text to integers and total per country and year.
# NOTE(review): the sum runs over every Criteria/Type/Coverage row of a
# country, so emigrant and immigrant rows are added together -- confirm that
# this combined figure is the intended measure.
tddata2 <- thedata2 %>%
  gather(year, total, "1980":"2013") %>%
  filter(total != "..") %>%
  mutate(total = as.integer(total)) %>%
  group_by(CntName, year) %>%
  # plain summarise() replaces the deprecated summarise_each(funs(sum), ...)
  summarise(total = sum(total)) %>%
  # summarise() leaves the result grouped by CntName; return a plain tibble
  ungroup() %>%
  arrange(year)
# Optional: restrict to a few countries before plotting, e.g.
#   filter(CntName %in% c("United States of America", "Germany", "France"))

GGPlot

1990-1991: USSR collapses, USSR migration explodes.

2008: Black Market, market crashes, Germany migration declines.

# Animated bubble chart of migration totals (in millions) by year, coloured by
# country and converted to an interactive plotly widget.
# library() stops with an error if plotly is missing, whereas require() only
# returns FALSE and lets the script fail later.
library(plotly)
p <- ggplot(tddata2, aes(total / 1000000, year, color = reorder(CntName, -total))) +
  # frame and ids are not ggplot2 aesthetics (ggplot2 warns about them);
  # ggplotly() uses them to build the animation frames and track points
  geom_point(aes(size = total, frame = year, ids = CntName)) +
  labs(color = "Country Names", x = "Migration by Millions", y = "Years")

p <- ggplotly(p) %>%
  animation_opts(2000, easing = "elastic", redraw = TRUE) %>%
  # title placed as a paper-referenced annotation above the plotting area
  add_annotations(
    yref = "paper",
    xref = "paper",
    y = 1.1,
    x = 0,
    text = "Migration By Year/Country",
    showarrow = FALSE,
    font = list(size = 17)
  ) %>%
  layout(title = FALSE)
p

# p = NULL

Data3 posted by Ravi Itwaru (Airline Safety)

https://raw.githubusercontent.com/fivethirtyeight/data/master/airline-safety/airline-safety.csv

Get data 3

# Read the airline-safety CSV directly from the raw GitHub URL and print a
# compact overview of its columns.
theurl3 <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/airline-safety/airline-safety.csv"
thedata3 <- read.table(theurl3, sep = ",", header = TRUE)
thedata3 %>% glimpse()

## Observations: 56
## Variables: 8
## $ airline                <fct> Aer Lingus, Aeroflot*, Aerolineas Argen...
## $ avail_seat_km_per_week <dbl> 320906734, 1197672318, 385803648, 59687...
## $ incidents_85_99        <int> 2, 76, 6, 3, 2, 14, 2, 3, 5, 7, 3, 21, ...
## $ fatal_accidents_85_99  <int> 0, 14, 0, 1, 0, 4, 1, 0, 0, 2, 1, 5, 0,...
## $ fatalities_85_99       <int> 0, 128, 0, 64, 0, 79, 329, 0, 0, 50, 1,...
## $ incidents_00_14        <int> 0, 6, 1, 5, 2, 6, 4, 5, 5, 4, 7, 17, 1,...
## $ fatal_accidents_00_14  <int> 0, 1, 0, 0, 0, 2, 1, 1, 1, 0, 0, 3, 0, ...
## $ fatalities_00_14       <int> 0, 88, 0, 0, 0, 337, 158, 7, 88, 0, 0, ...

Tidy Data by Airline Incident Counts

Gather all incident totals and display a GGPlot of the Top 10 Airlines by incidents, then compare incidents to kilometers per week.

# Preview the first six rows of the raw airline-safety table before reshaping.
head(thedata3, n = 6L)

##                 airline avail_seat_km_per_week incidents_85_99
## 1            Aer Lingus              320906734               2
## 2             Aeroflot*             1197672318              76
## 3 Aerolineas Argentinas              385803648               6
## 4           Aeromexico*              596871813               3
## 5            Air Canada             1865253802               2
## 6            Air France             3004002661              14
##   fatal_accidents_85_99 fatalities_85_99 incidents_00_14
## 1                     0                0               0
## 2                    14              128               6
## 3                     0                0               1
## 4                     1               64               5
## 5                     0                0               2
## 6                     4               79               6
##   fatal_accidents_00_14 fatalities_00_14
## 1                     0                0
## 2                     1               88
## 3                     0                0
## 4                     0                0
## 5                     0                0
## 6                     2              337

# Per-airline total of the six count columns (columns 3 to 8), keeping the ten
# airlines with the highest totals together with their weekly seat-kilometres.
# NOTE(review): columns 3:8 mix incidents, fatal accidents and fatalities, so
# `count` is the sum of all three measures -- confirm this is intended.
tddata3 <- thedata3 %>%
  gather(incidents, count, 3:8) %>%
  mutate(count = as.integer(count)) %>%
  filter(count > 0) %>%
  group_by(airline) %>%
  summarise(
    count = sum(count),
    # the same value repeats on every gathered row of an airline, so take it
    # once; summing it would multiply it by the number of non-zero rows
    avail_seat_km_per_week = first(avail_seat_km_per_week)
  ) %>%
  arrange(desc(count)) %>%
  # rank explicitly by count: with no wt, top_n() uses the last column
  top_n(10, count)

GGPlot Top 10 Airline Incidents

# Bar chart of the summed counts per airline, ordered from largest to
# smallest, with the fill legend hidden and x labels rotated.
ggplot(tddata3, aes(x = reorder(airline, -count), y = count, fill = airline)) +
  geom_col(position = "identity", show.legend = FALSE) +
  theme(axis.text.x = element_text(angle = -90))

# Scatter of summed counts against available seat-kilometres per week.
ggplot(tddata3, aes(x = avail_seat_km_per_week, y = count)) +
  geom_point(aes(color = airline, size = count))

#theme(axis.text.x = element_blank())

IS607 Project 2

Anthony Pagan

October 7, 2018

Load Required Libraries

Data1 posted by Michael Silva (Adult Arrests)

https://data.ny.gov/Public-Safety/Adult-Arrests-by-County-Beginning-1970/rikd-mt35/data.

Get Data 1

Tidy Data Offense

GGPlot

Tidy Data by Top 10 Counties

GGPlot

Data2 posted by Juanelle Marks (Population Migration)

http://www.un.org/en/development/desa/population/migration/data/estimates2/estimates17.shtml

Get Data 2

Tidy Data by Year Totals

GGPlot

1990-1991: USSR collapses, USSR migration explodes.

2008: Black Market, market crashes, Germany migration declines.

Data3 posted by Ravi Itwaru (Airline Safety)

https://raw.githubusercontent.com/fivethirtyeight/data/master/airline-safety/airline-safety.csv

Get data 3

Tidy Data by Airline Incident Counts

GGPlot Top 10 Airline Incidents