Can College Graduation Rate be used as precdictor for number of jobs in county (James Topor’s )
jobdata<-read.csv("https://raw.githubusercontent.com/vbriot28/datascienceCUNY_607/master/CUNY%20607_Project2_dataset2.csv", header =TRUE)
library(tidyr)## Warning: package 'tidyr' was built under R version 3.2.3
library(dplyr)## Warning: package 'dplyr' was built under R version 3.2.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
jobdata1<-tbl_df(jobdata)
# split the table to use make it long table
str(jobdata1)## Classes 'tbl_df', 'tbl' and 'data.frame': 5 obs. of 11 variables:
## $ County : Factor w/ 5 levels "Autauga","Baldwin",..: 1 2 3 4 5
## $ LandArea : int 599 1578 891 625 639
## $ NatAmenity : int 4 4 4 3 4
## $ College1970: num 0.064 0.065 0.073 0.042 0.027
## $ College1980: num 0.121 0.121 0.092 0.049 0.053
## $ College1990: num 0.145 0.168 0.118 0.047 0.07
## $ College2000: num 0.18 0.231 0.109 0.071 0.096
## $ Jobs1970 : int 6853 19749 9448 3965 7587
## $ Jobs1980 : int 11278 27861 9755 4276 9490
## $ Jobs1990 : int 11471 40809 12163 5564 11811
## $ Jobs2000 : int 16289 70247 15197 6098 16503
coldata<- select(jobdata, 1:7)
jobdata1 <- select(jobdata,County, 8:11)
names(coldata) <- c("County", "LandArea", "NatAmenity", "1970", "1980", "1990", "2000")
names(jobdata1) <- c("County", "1970", "1980", "1990", "2000")# make long table
long_col <- gather(coldata,yr,graduation, 4:7)
long_job<-gather(jobdata1,yr, job, 2:5)
#join this two
col_job<- inner_join(long_col, long_job, by = c("yr","County"))
str(long_col$yr)## chr [1:20] "1970" "1970" "1970" "1970" "1970" "1980" ...
library(ggplot2)## Warning: package 'ggplot2' was built under R version 3.2.3
ggplot(col_job, aes(x=graduation, y=job,color= yr)) + geom_line() +geom_point()# data suggests there is a threshold of graduation rate after that with increase of graduation rate there is a increase in number of jobsData set 2: New York citis leading causes of death
Show the leading causes of death each year for men and women.
data_death<-read.csv("Death.csv", header=T, stringsAsFactors = FALSE)
head(data_death,5)## Year Ethnicity Sex Cause.of.Death
## 1 2010 NON-HISPANIC BLACK MALE HUMAN IMMUNODEFICIENCY VIRUS DISEASE
## 2 2010 NON-HISPANIC BLACK MALE INFLUENZA AND PNEUMONIA
## 3 2010 NON-HISPANIC BLACK MALE INTENTIONAL SELF-HARM (SUICIDE)
## 4 2010 NON-HISPANIC BLACK MALE MALIGNANT NEOPLASMS
## 5 2010 NON-HISPANIC BLACK MALE MENTAL DISORDERS DUE TO USE OF ALCOHOL
## Count Percent
## 1 297 5
## 2 201 3
## 3 64 1
## 4 1540 23
## 5 50 1
# select by Cause of masimum number of Death overall for Male
summarise_each(filter(select(data_death, Year,Sex,Cause.of.Death,Count),Sex=="MALE"),funs(max))## Year Sex Cause.of.Death Count
## 1 2011 MALE VIRAL HEPATITIS 5632
# select by Cause of masimum number of Death overall for Female
summarise_each(filter(select(data_death, Year,Sex,Cause.of.Death,Count),Sex=="FEMALE"),funs(max))## Year Sex Cause.of.Death Count
## 1 2011 FEMALE VIRAL HEPATITIS 7050
# % Death by year and Sex
library(dplyr)
reason<-group_by(select(data_death,Year,Ethnicity,Sex,Cause.of.Death,Count),Year,Sex)
head(reason,10)## Source: local data frame [10 x 5]
## Groups: Year, Sex [1]
##
## Year Ethnicity Sex
## (int) (chr) (chr)
## 1 2010 NON-HISPANIC BLACK MALE
## 2 2010 NON-HISPANIC BLACK MALE
## 3 2010 NON-HISPANIC BLACK MALE
## 4 2010 NON-HISPANIC BLACK MALE
## 5 2010 NON-HISPANIC BLACK MALE
## 6 2010 NON-HISPANIC BLACK MALE
## 7 2010 NON-HISPANIC BLACK MALE
## 8 2010 NON-HISPANIC BLACK MALE
## 9 2010 NON-HISPANIC BLACK MALE
## 10 2010 NON-HISPANIC BLACK MALE
## Variables not shown: Cause.of.Death (chr), Count (int)
# % of death by Year and Sex
reason1<-mutate(filter(reason,Year==2010,Sex=="MALE"),Percent=Count*100/sum(Count))
head(reason1,10)## Source: local data frame [10 x 6]
## Groups: Year, Sex [1]
##
## Year Ethnicity Sex
## (int) (chr) (chr)
## 1 2010 NON-HISPANIC BLACK MALE
## 2 2010 NON-HISPANIC BLACK MALE
## 3 2010 NON-HISPANIC BLACK MALE
## 4 2010 NON-HISPANIC BLACK MALE
## 5 2010 NON-HISPANIC BLACK MALE
## 6 2010 NON-HISPANIC BLACK MALE
## 7 2010 NON-HISPANIC BLACK MALE
## 8 2010 NON-HISPANIC BLACK MALE
## 9 2010 NON-HISPANIC BLACK MALE
## 10 2010 NON-HISPANIC BLACK MALE
## Variables not shown: Cause.of.Death (chr), Count (int), Percent (dbl)
# top reasons for death by Ethinicty and Year and Sex
data_death %>% group_by(Ethnicity, Year, Sex) %>%
filter(Percent == max(Percent)) %>%
arrange(Ethnicity, Year)## Source: local data frame [120 x 6]
## Groups: Ethnicity, Year, Sex [40]
##
## Year Ethnicity Sex Cause.of.Death Count Percent
## (int) (chr) (chr) (chr) (int) (int)
## 1 2007 ASIAN & PACIFIC ISLANDER FEMALE DISEASES OF HEART 428 33
## 2 2007 ASIAN & PACIFIC ISLANDER FEMALE DISEASES OF HEART 428 33
## 3 2007 ASIAN & PACIFIC ISLANDER FEMALE DISEASES OF HEART 428 33
## 4 2007 ASIAN & PACIFIC ISLANDER MALE MALIGNANT NEOPLASMS 528 33
## 5 2007 ASIAN & PACIFIC ISLANDER MALE MALIGNANT NEOPLASMS 528 33
## 6 2007 ASIAN & PACIFIC ISLANDER MALE MALIGNANT NEOPLASMS 528 33
## 7 2008 ASIAN & PACIFIC ISLANDER FEMALE DISEASES OF HEART 456 34
## 8 2008 ASIAN & PACIFIC ISLANDER FEMALE DISEASES OF HEART 456 34
## 9 2008 ASIAN & PACIFIC ISLANDER FEMALE DISEASES OF HEART 456 34
## 10 2008 ASIAN & PACIFIC ISLANDER MALE DISEASES OF HEART 546 33
## .. ... ... ... ... ... ...
Data set 3: App download data Question to answer which App was downloaded most number of times for ios and android
app_data<-read.csv("appdata.csv")
str(app_data)## 'data.frame': 12 obs. of 17 variables:
## $ App.Name : Factor w/ 12 levels "Angry Birds",..: 3 6 1 9 5 4 10 2 11 12 ...
## $ Type : Factor w/ 1 level "Game": 1 1 1 1 1 1 1 1 1 1 ...
## $ Paid.Free : Factor w/ 2 levels "Free","Paid": 1 1 1 1 1 1 1 1 1 1 ...
## $ Release.Date : Factor w/ 12 levels "1/16/2013","10/14/2015",..: 6 7 4 8 9 10 11 5 1 12 ...
## $ X2010 : int NA 4 10 NA NA NA NA NA NA 143 ...
## $ X2011 : int NA 8 124 NA NA NA 18 104 NA 156 ...
## $ X2012 : int 8 58 320 23 NA 24 102 204 NA 100 ...
## $ X2013 : int 56 102 547 123 16 123 246 382 230 93 ...
## $ X2014 : int 60 126 648 202 58 234 306 485 403 85 ...
## $ X2015 : int 76 148 627 303 128 345 378 324 503 86 ...
## $ Release.Date.1: Factor w/ 12 levels "1/16/2013","10/14/2015",..: 5 11 4 9 10 3 8 7 1 12 ...
## $ X2010.1 : int NA 1 2 NA NA NA NA NA NA 146 ...
## $ X2011.1 : int NA 9 108 NA NA NA NA 108 NA 173 ...
## $ X2012.1 : int 2 64 312 26 NA NA 108 205 NA 112 ...
## $ X2013.1 : int 53 108 538 128 18 4 254 398 253 105 ...
## $ X2014.1 : int 64 132 647 236 64 143 302 476 493 95 ...
## $ X2015.1 : int 72 165 656 329 294 256 402 389 523 92 ...
apple<-select(app_data,1:10)
str(apple)## 'data.frame': 12 obs. of 10 variables:
## $ App.Name : Factor w/ 12 levels "Angry Birds",..: 3 6 1 9 5 4 10 2 11 12 ...
## $ Type : Factor w/ 1 level "Game": 1 1 1 1 1 1 1 1 1 1 ...
## $ Paid.Free : Factor w/ 2 levels "Free","Paid": 1 1 1 1 1 1 1 1 1 1 ...
## $ Release.Date: Factor w/ 12 levels "1/16/2013","10/14/2015",..: 6 7 4 8 9 10 11 5 1 12 ...
## $ X2010 : int NA 4 10 NA NA NA NA NA NA 143 ...
## $ X2011 : int NA 8 124 NA NA NA 18 104 NA 156 ...
## $ X2012 : int 8 58 320 23 NA 24 102 204 NA 100 ...
## $ X2013 : int 56 102 547 123 16 123 246 382 230 93 ...
## $ X2014 : int 60 126 648 202 58 234 306 485 403 85 ...
## $ X2015 : int 76 148 627 303 128 345 378 324 503 86 ...
names(apple)<-c("App_Name","Type","Paid.Free","Release.Date","2010","2011","2012","2013","2014","2015")
android<-select(app_data,1:3,11:17)
str(android)## 'data.frame': 12 obs. of 10 variables:
## $ App.Name : Factor w/ 12 levels "Angry Birds",..: 3 6 1 9 5 4 10 2 11 12 ...
## $ Type : Factor w/ 1 level "Game": 1 1 1 1 1 1 1 1 1 1 ...
## $ Paid.Free : Factor w/ 2 levels "Free","Paid": 1 1 1 1 1 1 1 1 1 1 ...
## $ Release.Date.1: Factor w/ 12 levels "1/16/2013","10/14/2015",..: 5 11 4 9 10 3 8 7 1 12 ...
## $ X2010.1 : int NA 1 2 NA NA NA NA NA NA 146 ...
## $ X2011.1 : int NA 9 108 NA NA NA NA 108 NA 173 ...
## $ X2012.1 : int 2 64 312 26 NA NA 108 205 NA 112 ...
## $ X2013.1 : int 53 108 538 128 18 4 254 398 253 105 ...
## $ X2014.1 : int 64 132 647 236 64 143 302 476 493 95 ...
## $ X2015.1 : int 72 165 656 329 294 256 402 389 523 92 ...
names(android)<-c("App_Name","Type","Paid.Free","Release.Date1","2010","2011","2012","2013","2014","2015")
apple_dow<-gather(apple, "year", "downloads_app", 5:10)
apple_dow $ downloads_app[is.na(apple_dow $ downloads_app)]<-0
android_dow<-gather(android, "year", "downloads_and", 5:10)
android_dow $ downloads_and[is.na(android_dow $ downloads_and)]<-0str(apple_dow)## 'data.frame': 72 obs. of 6 variables:
## $ App_Name : Factor w/ 12 levels "Angry Birds",..: 3 6 1 9 5 4 10 2 11 12 ...
## $ Type : Factor w/ 1 level "Game": 1 1 1 1 1 1 1 1 1 1 ...
## $ Paid.Free : Factor w/ 2 levels "Free","Paid": 1 1 1 1 1 1 1 1 1 1 ...
## $ Release.Date : Factor w/ 12 levels "1/16/2013","10/14/2015",..: 6 7 4 8 9 10 11 5 1 12 ...
## $ year : chr "2010" "2010" "2010" "2010" ...
## $ downloads_app: num 0 4 10 0 0 0 0 0 0 143 ...
apple_dow %>% group_by(year) %>% filter(downloads_app == max(downloads_app)) %>%arrange(year)## Source: local data frame [6 x 6]
## Groups: year [6]
##
## App_Name Type Paid.Free Release.Date year downloads_app
## (fctr) (fctr) (fctr) (fctr) (chr) (dbl)
## 1 Words With Friends Game Free 9-Jul 2010 143
## 2 Words With Friends Game Free 9-Jul 2011 156
## 3 Angry Birds Game Free 12/11/2009 2012 320
## 4 Angry Birds Game Free 12/11/2009 2013 547
## 5 Angry Birds Game Free 12/11/2009 2014 648
## 6 Angry Birds Game Free 12/11/2009 2015 627
library(ggplot2)
ggplot(apple_dow, aes(y=downloads_app, x=year,color=App_Name)) + geom_line()+geom_point()## geom_path: Each group consists of only one observation. Do you need to
## adjust the group aesthetic?
android_dow %>% group_by(year) %>%filter(downloads_and == max(downloads_and)) %>%
arrange(year)## Source: local data frame [6 x 6]
## Groups: year [6]
##
## App_Name Type Paid.Free Release.Date1 year downloads_and
## (fctr) (fctr) (fctr) (fctr) (chr) (dbl)
## 1 Words With Friends Game Free 9-Jul 2010 146
## 2 Words With Friends Game Free 9-Jul 2011 173
## 3 Angry Birds Game Free 11/19/2010 2012 312
## 4 Angry Birds Game Free 11/19/2010 2013 538
## 5 Angry Birds Game Free 11/19/2010 2014 647
## 6 Angry Birds Game Free 11/19/2010 2015 656
ggplot(android_dow, aes(y=downloads_and, x=year,color=App_Name)) + geom_line() +geom_point() ## geom_path: Each group consists of only one observation. Do you need to
## adjust the group aesthetic?
working with NAB data file-
Most win by a team by year
Load data file
NBA_data<-read.csv("C:/Users/Arindam/Documents/Data Science/Cuny/Data 607/Assignments/leagues_NBA_wins_active.csv", header=TRUE,stringsAsFactors = FALSE)# Get rid of total values last column
NBA_data<-NBA_data[1:73,]
library(tidyr)
NBA_final<-NBA_data%>% gather(team, win, ATL:WAS)
NBA_final$win[NBA_final$win==""]<-0
# remove non numeric values
NBA_final$win<-as.numeric(NBA_final$win)## Warning: NAs introduced by coercion
NBA_final$win[is.na(NBA_final$win)]<-0
unique(NBA_final$Lg)## [1] "NBA" "Lg" "BAA"
NBA_final$Lg[(NBA_final$Lg!="NBA")]<-c("NBA")# summarize data and visualization
arrange(summarize(group_by(select(NBA_final,team,win),team),count=sum(win)),desc(count))## Source: local data frame [30 x 2]
##
## team count
## (chr) (dbl)
## 1 LAL 3232
## 2 BOS 3211
## 3 PHI 2724
## 4 NYK 2695
## 5 ATL 2638
## 6 DET 2604
## 7 GSW 2573
## 8 SAC 2456
## 9 OKC 2127
## 10 PHO 2116
## .. ... ...
NBA_final %>% group_by(Season) %>%filter(win == max(win)) %>%
arrange((Season), desc(Season))## Source: local data frame [169 x 5]
## Groups: Season [71]
##
## Rk Season Lg team win
## (chr) (chr) (chr) (chr) (dbl)
## 1 70 1946-47 NBA GSW 35
## 2 69 1947-48 NBA GSW 27
## 3 68 1948-49 NBA SAC 45
## 4 67 1949-50 NBA LAL 51
## 5 67 1949-50 NBA PHI 51
## 6 67 1949-50 NBA SAC 51
## 7 66 1950-51 NBA LAL 44
## 8 65 1951-52 NBA SAC 41
## 9 64 1952-53 NBA LAL 48
## 10 63 1953-54 NBA LAL 46
## .. ... ... ... ... ...
NBA_final %>% group_by(Season) %>%filter(win == max(win)) %>%
arrange(Rk,team)## Source: local data frame [169 x 5]
## Groups: Season [71]
##
## Rk Season Lg team win
## (chr) (chr) (chr) (chr) (dbl)
## 1 70 1946-47 NBA GSW 35
## 2 69 1947-48 NBA GSW 27
## 3 68 1948-49 NBA SAC 45
## 4 67 1949-50 NBA LAL 51
## 5 67 1949-50 NBA PHI 51
## 6 67 1949-50 NBA SAC 51
## 7 66 1950-51 NBA LAL 44
## 8 65 1951-52 NBA SAC 41
## 9 64 1952-53 NBA LAL 48
## 10 63 1953-54 NBA LAL 46
## .. ... ... ... ... ...