Project - Working with undity data

Can College Graduation Rate be used as precdictor for number of jobs in county (James Topor’s )

Load the data file http://www.theanalysisfactor.com/wide-and-long-data

jobdata<-read.csv("https://raw.githubusercontent.com/vbriot28/datascienceCUNY_607/master/CUNY%20607_Project2_dataset2.csv", header =TRUE)

library(tidyr)

## Warning: package 'tidyr' was built under R version 3.2.3

library(dplyr)

## Warning: package 'dplyr' was built under R version 3.2.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

jobdata1<-tbl_df(jobdata)

# split the table to use make it long table
str(jobdata1)

## Classes 'tbl_df', 'tbl' and 'data.frame':    5 obs. of  11 variables:
##  $ County     : Factor w/ 5 levels "Autauga","Baldwin",..: 1 2 3 4 5
##  $ LandArea   : int  599 1578 891 625 639
##  $ NatAmenity : int  4 4 4 3 4
##  $ College1970: num  0.064 0.065 0.073 0.042 0.027
##  $ College1980: num  0.121 0.121 0.092 0.049 0.053
##  $ College1990: num  0.145 0.168 0.118 0.047 0.07
##  $ College2000: num  0.18 0.231 0.109 0.071 0.096
##  $ Jobs1970   : int  6853 19749 9448 3965 7587
##  $ Jobs1980   : int  11278 27861 9755 4276 9490
##  $ Jobs1990   : int  11471 40809 12163 5564 11811
##  $ Jobs2000   : int  16289 70247 15197 6098 16503

coldata<- select(jobdata, 1:7)
jobdata1 <- select(jobdata,County, 8:11)


names(coldata) <- c("County", "LandArea", "NatAmenity", "1970", "1980", "1990", "2000")
names(jobdata1) <- c("County", "1970", "1980", "1990", "2000")

prepare long data file

# make long table
long_col <-  gather(coldata,yr,graduation, 4:7)
long_job<-gather(jobdata1,yr, job, 2:5)


#join this two

col_job<- inner_join(long_col, long_job, by = c("yr","County"))
str(long_col$yr)

##  chr [1:20] "1970" "1970" "1970" "1970" "1970" "1980" ...

Data visualization

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.2.3

ggplot(col_job, aes(x=graduation, y=job,color= yr)) + geom_line() +geom_point()

# data suggests there is a threshold of graduation rate after that with increase of graduation rate there is a increase in number of jobs

Data set 2: New York citis leading causes of death

Show the leading causes of death each year for men and women.

load the data file

data_death<-read.csv("Death.csv", header=T, stringsAsFactors = FALSE)
head(data_death,5)

##   Year          Ethnicity  Sex                         Cause.of.Death
## 1 2010 NON-HISPANIC BLACK MALE   HUMAN IMMUNODEFICIENCY VIRUS DISEASE
## 2 2010 NON-HISPANIC BLACK MALE                INFLUENZA AND PNEUMONIA
## 3 2010 NON-HISPANIC BLACK MALE        INTENTIONAL SELF-HARM (SUICIDE)
## 4 2010 NON-HISPANIC BLACK MALE                    MALIGNANT NEOPLASMS
## 5 2010 NON-HISPANIC BLACK MALE MENTAL DISORDERS DUE TO USE OF ALCOHOL
##   Count Percent
## 1   297       5
## 2   201       3
## 3    64       1
## 4  1540      23
## 5    50       1

Analysis on leading causes of death

# select by Cause of masimum number of Death overall for Male

summarise_each(filter(select(data_death, Year,Sex,Cause.of.Death,Count),Sex=="MALE"),funs(max))

##   Year  Sex  Cause.of.Death Count
## 1 2011 MALE VIRAL HEPATITIS  5632

# select by Cause of masimum number of Death overall for Female
summarise_each(filter(select(data_death, Year,Sex,Cause.of.Death,Count),Sex=="FEMALE"),funs(max))

##   Year    Sex  Cause.of.Death Count
## 1 2011 FEMALE VIRAL HEPATITIS  7050

# % Death by year and Sex
library(dplyr)
reason<-group_by(select(data_death,Year,Ethnicity,Sex,Cause.of.Death,Count),Year,Sex)
head(reason,10)

## Source: local data frame [10 x 5]
## Groups: Year, Sex [1]
## 
##     Year          Ethnicity   Sex
##    (int)              (chr) (chr)
## 1   2010 NON-HISPANIC BLACK  MALE
## 2   2010 NON-HISPANIC BLACK  MALE
## 3   2010 NON-HISPANIC BLACK  MALE
## 4   2010 NON-HISPANIC BLACK  MALE
## 5   2010 NON-HISPANIC BLACK  MALE
## 6   2010 NON-HISPANIC BLACK  MALE
## 7   2010 NON-HISPANIC BLACK  MALE
## 8   2010 NON-HISPANIC BLACK  MALE
## 9   2010 NON-HISPANIC BLACK  MALE
## 10  2010 NON-HISPANIC BLACK  MALE
## Variables not shown: Cause.of.Death (chr), Count (int)

# % of death by Year and Sex

reason1<-mutate(filter(reason,Year==2010,Sex=="MALE"),Percent=Count*100/sum(Count))
head(reason1,10)

## Source: local data frame [10 x 6]
## Groups: Year, Sex [1]
## 
##     Year          Ethnicity   Sex
##    (int)              (chr) (chr)
## 1   2010 NON-HISPANIC BLACK  MALE
## 2   2010 NON-HISPANIC BLACK  MALE
## 3   2010 NON-HISPANIC BLACK  MALE
## 4   2010 NON-HISPANIC BLACK  MALE
## 5   2010 NON-HISPANIC BLACK  MALE
## 6   2010 NON-HISPANIC BLACK  MALE
## 7   2010 NON-HISPANIC BLACK  MALE
## 8   2010 NON-HISPANIC BLACK  MALE
## 9   2010 NON-HISPANIC BLACK  MALE
## 10  2010 NON-HISPANIC BLACK  MALE
## Variables not shown: Cause.of.Death (chr), Count (int), Percent (dbl)

# top reasons for death by Ethinicty and Year and Sex
  data_death %>% group_by(Ethnicity, Year, Sex) %>%
  filter(Percent == max(Percent)) %>%
  arrange(Ethnicity, Year)

## Source: local data frame [120 x 6]
## Groups: Ethnicity, Year, Sex [40]
## 
##     Year                Ethnicity    Sex      Cause.of.Death Count Percent
##    (int)                    (chr)  (chr)               (chr) (int)   (int)
## 1   2007 ASIAN & PACIFIC ISLANDER FEMALE   DISEASES OF HEART   428      33
## 2   2007 ASIAN & PACIFIC ISLANDER FEMALE   DISEASES OF HEART   428      33
## 3   2007 ASIAN & PACIFIC ISLANDER FEMALE   DISEASES OF HEART   428      33
## 4   2007 ASIAN & PACIFIC ISLANDER   MALE MALIGNANT NEOPLASMS   528      33
## 5   2007 ASIAN & PACIFIC ISLANDER   MALE MALIGNANT NEOPLASMS   528      33
## 6   2007 ASIAN & PACIFIC ISLANDER   MALE MALIGNANT NEOPLASMS   528      33
## 7   2008 ASIAN & PACIFIC ISLANDER FEMALE   DISEASES OF HEART   456      34
## 8   2008 ASIAN & PACIFIC ISLANDER FEMALE   DISEASES OF HEART   456      34
## 9   2008 ASIAN & PACIFIC ISLANDER FEMALE   DISEASES OF HEART   456      34
## 10  2008 ASIAN & PACIFIC ISLANDER   MALE   DISEASES OF HEART   546      33
## ..   ...                      ...    ...                 ...   ...     ...

Data set 3: App download data Question to answer which App was downloaded most number of times for ios and android

Load the data

app_data<-read.csv("appdata.csv")

str(app_data)

## 'data.frame':    12 obs. of  17 variables:
##  $ App.Name      : Factor w/ 12 levels "Angry Birds",..: 3 6 1 9 5 4 10 2 11 12 ...
##  $ Type          : Factor w/ 1 level "Game": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Paid.Free     : Factor w/ 2 levels "Free","Paid": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Release.Date  : Factor w/ 12 levels "1/16/2013","10/14/2015",..: 6 7 4 8 9 10 11 5 1 12 ...
##  $ X2010         : int  NA 4 10 NA NA NA NA NA NA 143 ...
##  $ X2011         : int  NA 8 124 NA NA NA 18 104 NA 156 ...
##  $ X2012         : int  8 58 320 23 NA 24 102 204 NA 100 ...
##  $ X2013         : int  56 102 547 123 16 123 246 382 230 93 ...
##  $ X2014         : int  60 126 648 202 58 234 306 485 403 85 ...
##  $ X2015         : int  76 148 627 303 128 345 378 324 503 86 ...
##  $ Release.Date.1: Factor w/ 12 levels "1/16/2013","10/14/2015",..: 5 11 4 9 10 3 8 7 1 12 ...
##  $ X2010.1       : int  NA 1 2 NA NA NA NA NA NA 146 ...
##  $ X2011.1       : int  NA 9 108 NA NA NA NA 108 NA 173 ...
##  $ X2012.1       : int  2 64 312 26 NA NA 108 205 NA 112 ...
##  $ X2013.1       : int  53 108 538 128 18 4 254 398 253 105 ...
##  $ X2014.1       : int  64 132 647 236 64 143 302 476 493 95 ...
##  $ X2015.1       : int  72 165 656 329 294 256 402 389 523 92 ...

data transformation on the file

apple<-select(app_data,1:10)

str(apple)

## 'data.frame':    12 obs. of  10 variables:
##  $ App.Name    : Factor w/ 12 levels "Angry Birds",..: 3 6 1 9 5 4 10 2 11 12 ...
##  $ Type        : Factor w/ 1 level "Game": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Paid.Free   : Factor w/ 2 levels "Free","Paid": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Release.Date: Factor w/ 12 levels "1/16/2013","10/14/2015",..: 6 7 4 8 9 10 11 5 1 12 ...
##  $ X2010       : int  NA 4 10 NA NA NA NA NA NA 143 ...
##  $ X2011       : int  NA 8 124 NA NA NA 18 104 NA 156 ...
##  $ X2012       : int  8 58 320 23 NA 24 102 204 NA 100 ...
##  $ X2013       : int  56 102 547 123 16 123 246 382 230 93 ...
##  $ X2014       : int  60 126 648 202 58 234 306 485 403 85 ...
##  $ X2015       : int  76 148 627 303 128 345 378 324 503 86 ...

names(apple)<-c("App_Name","Type","Paid.Free","Release.Date","2010","2011","2012","2013","2014","2015") 

android<-select(app_data,1:3,11:17)

str(android)

## 'data.frame':    12 obs. of  10 variables:
##  $ App.Name      : Factor w/ 12 levels "Angry Birds",..: 3 6 1 9 5 4 10 2 11 12 ...
##  $ Type          : Factor w/ 1 level "Game": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Paid.Free     : Factor w/ 2 levels "Free","Paid": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Release.Date.1: Factor w/ 12 levels "1/16/2013","10/14/2015",..: 5 11 4 9 10 3 8 7 1 12 ...
##  $ X2010.1       : int  NA 1 2 NA NA NA NA NA NA 146 ...
##  $ X2011.1       : int  NA 9 108 NA NA NA NA 108 NA 173 ...
##  $ X2012.1       : int  2 64 312 26 NA NA 108 205 NA 112 ...
##  $ X2013.1       : int  53 108 538 128 18 4 254 398 253 105 ...
##  $ X2014.1       : int  64 132 647 236 64 143 302 476 493 95 ...
##  $ X2015.1       : int  72 165 656 329 294 256 402 389 523 92 ...

names(android)<-c("App_Name","Type","Paid.Free","Release.Date1","2010","2011","2012","2013","2014","2015") 


apple_dow<-gather(apple, "year", "downloads_app", 5:10)

apple_dow $ downloads_app[is.na(apple_dow $ downloads_app)]<-0


android_dow<-gather(android, "year", "downloads_and", 5:10)

android_dow $ downloads_and[is.na(android_dow $ downloads_and)]<-0

Summarize data of mostly dowloaded app

str(apple_dow)

## 'data.frame':    72 obs. of  6 variables:
##  $ App_Name     : Factor w/ 12 levels "Angry Birds",..: 3 6 1 9 5 4 10 2 11 12 ...
##  $ Type         : Factor w/ 1 level "Game": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Paid.Free    : Factor w/ 2 levels "Free","Paid": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Release.Date : Factor w/ 12 levels "1/16/2013","10/14/2015",..: 6 7 4 8 9 10 11 5 1 12 ...
##  $ year         : chr  "2010" "2010" "2010" "2010" ...
##  $ downloads_app: num  0 4 10 0 0 0 0 0 0 143 ...

apple_dow %>% group_by(year) %>% filter(downloads_app == max(downloads_app)) %>%arrange(year)

## Source: local data frame [6 x 6]
## Groups: year [6]
## 
##             App_Name   Type Paid.Free Release.Date  year downloads_app
##               (fctr) (fctr)    (fctr)       (fctr) (chr)         (dbl)
## 1 Words With Friends   Game      Free        9-Jul  2010           143
## 2 Words With Friends   Game      Free        9-Jul  2011           156
## 3        Angry Birds   Game      Free   12/11/2009  2012           320
## 4        Angry Birds   Game      Free   12/11/2009  2013           547
## 5        Angry Birds   Game      Free   12/11/2009  2014           648
## 6        Angry Birds   Game      Free   12/11/2009  2015           627

library(ggplot2)

ggplot(apple_dow, aes(y=downloads_app, x=year,color=App_Name)) + geom_line()+geom_point()

## geom_path: Each group consists of only one observation. Do you need to
## adjust the group aesthetic?

 android_dow %>% group_by(year) %>%filter(downloads_and == max(downloads_and)) %>%
  arrange(year)

## Source: local data frame [6 x 6]
## Groups: year [6]
## 
##             App_Name   Type Paid.Free Release.Date1  year downloads_and
##               (fctr) (fctr)    (fctr)        (fctr) (chr)         (dbl)
## 1 Words With Friends   Game      Free         9-Jul  2010           146
## 2 Words With Friends   Game      Free         9-Jul  2011           173
## 3        Angry Birds   Game      Free    11/19/2010  2012           312
## 4        Angry Birds   Game      Free    11/19/2010  2013           538
## 5        Angry Birds   Game      Free    11/19/2010  2014           647
## 6        Angry Birds   Game      Free    11/19/2010  2015           656

ggplot(android_dow, aes(y=downloads_and, x=year,color=App_Name)) + geom_line() +geom_point()

## geom_path: Each group consists of only one observation. Do you need to
## adjust the group aesthetic?

working with NAB data file-

Most overall wins in franchise history
Most win by a team by year
Load data file

NBA_data<-read.csv("C:/Users/Arindam/Documents/Data Science/Cuny/Data 607/Assignments/leagues_NBA_wins_active.csv", header=TRUE,stringsAsFactors = FALSE)

Transform data

#  Get rid of total values last column
NBA_data<-NBA_data[1:73,]


library(tidyr)
NBA_final<-NBA_data%>% gather(team, win, ATL:WAS)
NBA_final$win[NBA_final$win==""]<-0

# remove non numeric values
NBA_final$win<-as.numeric(NBA_final$win)

## Warning: NAs introduced by coercion

NBA_final$win[is.na(NBA_final$win)]<-0


unique(NBA_final$Lg)

## [1] "NBA" "Lg"  "BAA"

NBA_final$Lg[(NBA_final$Lg!="NBA")]<-c("NBA")

summarize data to find out answer to the question

# summarize data and visualization

arrange(summarize(group_by(select(NBA_final,team,win),team),count=sum(win)),desc(count))

## Source: local data frame [30 x 2]
## 
##     team count
##    (chr) (dbl)
## 1    LAL  3232
## 2    BOS  3211
## 3    PHI  2724
## 4    NYK  2695
## 5    ATL  2638
## 6    DET  2604
## 7    GSW  2573
## 8    SAC  2456
## 9    OKC  2127
## 10   PHO  2116
## ..   ...   ...

 NBA_final %>% group_by(Season) %>%filter(win == max(win)) %>%
  arrange((Season), desc(Season))

## Source: local data frame [169 x 5]
## Groups: Season [71]
## 
##       Rk  Season    Lg  team   win
##    (chr)   (chr) (chr) (chr) (dbl)
## 1     70 1946-47   NBA   GSW    35
## 2     69 1947-48   NBA   GSW    27
## 3     68 1948-49   NBA   SAC    45
## 4     67 1949-50   NBA   LAL    51
## 5     67 1949-50   NBA   PHI    51
## 6     67 1949-50   NBA   SAC    51
## 7     66 1950-51   NBA   LAL    44
## 8     65 1951-52   NBA   SAC    41
## 9     64 1952-53   NBA   LAL    48
## 10    63 1953-54   NBA   LAL    46
## ..   ...     ...   ...   ...   ...

 NBA_final %>% group_by(Season) %>%filter(win == max(win)) %>%
  arrange(Rk,team)

## Source: local data frame [169 x 5]
## Groups: Season [71]
## 
##       Rk  Season    Lg  team   win
##    (chr)   (chr) (chr) (chr) (dbl)
## 1     70 1946-47   NBA   GSW    35
## 2     69 1947-48   NBA   GSW    27
## 3     68 1948-49   NBA   SAC    45
## 4     67 1949-50   NBA   LAL    51
## 5     67 1949-50   NBA   PHI    51
## 6     67 1949-50   NBA   SAC    51
## 7     66 1950-51   NBA   LAL    44
## 8     65 1951-52   NBA   SAC    41
## 9     64 1952-53   NBA   LAL    48
## 10    63 1953-54   NBA   LAL    46
## ..   ...     ...   ...   ...   ...