This project covers three untidy datasets: Toronto police marijuana arrests, NIH fiscal year 2018 grant data, and Emmy award winners since the inception of the award. The goal is to load each dataset from a csv into R, tidy the data, transform it for analysis, and run an analysis.

Add necessary tidying, transforming, and data visualization libraries

library(dplyr)
library(tidyr)
library(stringr)
library(ggplot2)
library(zoo)

Arrests

The first dataset, posted by Sie Song Wong, is a record of marijuana arrests in Toronto that contain the variables race, sex, employment status, and citizenship status. The original dataset was relatively tidy, so I used the pivot table function of Excel to put counts of each sex, male and female, as column headers. This created a wide dataset that would need to be tidied. The csv can be found here: https://github.com/hdupre/DATA607/blob/master/Project2/arrests.csv

Loading and tidying

arrests_raw <- read.csv("arrests.csv", header = TRUE)

# Renaming unnamed headers
arrests_raw <- rename(arrests_raw, race=X, age=X.1, year=X.2,employed=Employed,citizen=Citizen,count_f=Female,count_m=Male)
head(arrests_raw)
##    race age year employed citizen count_f count_m
## 1 Black  13 1997      Yes      No      NA       1
## 2        NA 1998      Yes     Yes      NA       1
## 3        NA 1999      Yes     Yes      NA       1
## 4        NA 2000      Yes     Yes      NA       1
## 5        NA 2001      Yes     Yes      NA       1
## 6        14 1997      Yes      No      NA       1
# Using na.locf to fill NA data with correct values
arrests_raw$race[arrests_raw$race == ""] <- NA
arrests_raw$race <- na.locf(arrests_raw$race)
arrests_raw$age <- na.locf(arrests_raw$age)
arrests_raw$year <- na.locf(arrests_raw$year)
head(arrests_raw)
##    race age year employed citizen count_f count_m
## 1 Black  13 1997      Yes      No      NA       1
## 2 Black  13 1998      Yes     Yes      NA       1
## 3 Black  13 1999      Yes     Yes      NA       1
## 4 Black  13 2000      Yes     Yes      NA       1
## 5 Black  13 2001      Yes     Yes      NA       1
## 6 Black  14 1997      Yes      No      NA       1
# Splitting data into male and female sets so they can be joined again as one "sex" variable
arrests_raw_female <- subset(arrests_raw,select=-count_m)
arrests_raw_female <- na.omit(arrests_raw_female)
arrests_raw_male <- subset(arrests_raw,select=-count_f)
arrests_raw_male <- na.omit(arrests_raw_male)

# Creating an observation based on the count of each record
arrests_female <- data.frame(arrests_raw_female[rep(seq_len(dim(arrests_raw_female)[1]), arrests_raw_female$count_f), , drop = FALSE], row.names=NULL)

arrests_male <- data.frame(arrests_raw_male[rep(seq_len(dim(arrests_raw_male)[1]), arrests_raw_male$count_m), , drop = FALSE], row.names=NULL)

# Removing unneeded count fields
arrests_female <- subset(arrests_female, select= -count_f)
arrests_male <- subset(arrests_male, select= -count_m)

# Adding "sex" field to each set
arrests_female$sex <- "Female"
arrests_male$sex <- "Male"

# Re-joining male and female sets, setting blank citizen status to "Unknown," setting yes/no citizen status to explicit citizen vs non-citizen
arrests_tidy <- rbind(arrests_male,arrests_female)
arrests_tidy$employed <- sub("^$","Unknown",arrests_tidy$employed)
arrests_tidy$citizen <- sub("No","Non-Citizen",arrests_tidy$citizen)
arrests_tidy$citizen <- sub("Yes","Citizen",arrests_tidy$citizen)
head(arrests_tidy)
##    race age year employed     citizen  sex
## 1 Black  13 1997      Yes Non-Citizen Male
## 2 Black  13 1998      Yes     Citizen Male
## 3 Black  13 1999      Yes     Citizen Male
## 4 Black  13 2000      Yes     Citizen Male
## 5 Black  13 2001      Yes     Citizen Male
## 6 Black  14 1997      Yes Non-Citizen Male

Transformation for analysis

# Selecting by citizen status and sex, avoiding race and age for this analysis.
arrests_analysis <- select(arrests_tidy,citizen,sex)
arrests_analysis <- group_by(arrests_analysis,citizen,sex) 
arrests_analysis <- summarize(arrests_analysis,count=n())
arrests_analysis <- group_by(arrests_analysis)
arrests_analysis <- mutate(arrests_analysis,sum=sum(count))
arrests_analysis <- data.frame(mutate(arrests_analysis, pct_total=round(count/sum*100,2)))
arrests_analysis$demographic <- paste(arrests_analysis$citizen,arrests_analysis$sex)
arrests_analysis
##       citizen    sex count  sum pct_total        demographic
## 1     Citizen Female   409 5226      7.83     Citizen Female
## 2     Citizen   Male  4046 5226     77.42       Citizen Male
## 3 Non-Citizen Female    34 5226      0.65 Non-Citizen Female
## 4 Non-Citizen   Male   737 5226     14.10   Non-Citizen Male

Data visualization

ggplot(arrests_analysis, aes(x=demographic,y = pct_total)) +
   geom_bar(width = .75,stat = "identity", position="dodge") +
   ggtitle("Percent of Toronto Marijuana Arrests per Citizen Status and Sex") +
   labs(x="Suspect Demographic",y="Percent of All Arrests") +
   theme(plot.title = element_text(hjust=0.5)) +
   scale_y_continuous(breaks = seq(0,100,by = 5))

Conclusion

Males overall have a greater number of arrests than females, but citizen male arrest numbers dwarf the other three analyzed demographic groups

Grants

This dataset, posted by myself, contains NIH awarded grant data for fiscal year 2018. The variables are institution, principal investigator, date of grant award, funding mechanism, and award amount. As the funding mechanisms are stretched across the column header in original dataset, this will have to be tidied into long format. The csv can be found here: https://github.com/hdupre/DATA607/blob/master/Project2/grants.csv

Loading and tidying

grants_raw <- read.csv("grants.csv")
grants_raw <- rename(grants_raw, institution=X, principal_investigator=X.1, date_granted=X.2)
head(grants_raw)
##                            institution principal_investigator date_granted
## 1          21ST CENTURY MEDICINE, INC.        FAHY, GREGORY M      9/17/18
## 2                                              TING, ALISON Y      9/25/18
## 3      21ST CENTURY THERAPEUTICS, INC.           SHAW, JIAJIU      9/16/18
## 4 3-C INSTITUTE FOR SOCIAL DEVELOPMENT       CHILDRESS, DEBRA     11/15/17
## 5                                                                  9/19/18
## 6                                         DEROSIER, MELISSA E       7/7/18
##   Construction Other Other.Research.Related R.D.Contracts Research.Centers
## 1           NA    NA                     NA            NA               NA
## 2           NA    NA                     NA            NA               NA
## 3           NA    NA                     NA            NA               NA
## 4           NA    NA                     NA            NA               NA
## 5           NA    NA                     NA            NA               NA
## 6           NA    NA                     NA            NA               NA
##   RPGs...Non.SBIR.STTR RPGs...SBIR.STTR Training...Individual
## 1                   NA           219632                    NA
## 2                   NA           172694                    NA
## 3                   NA           345914                    NA
## 4                   NA           209696                    NA
## 5                   NA           504984                    NA
## 6                   NA           541517                    NA
##   Training...Institutional
## 1                       NA
## 2                       NA
## 3                       NA
## 4                       NA
## 5                       NA
## 6                       NA
# Replacing blanks with NAs so that na.locf can be used to fill missing values
grants_raw$institution[grants_raw$institution == ""] <- NA
grants_raw$principal_investigator[grants_raw$principal_investigator == ""] <- NA
grants_raw$institution <- na.locf(grants_raw$institution)
grants_raw$principal_investigator <- na.locf(grants_raw$principal_investigator)

# Putting data into long format by creating variable "funding_mechanism"
grants_tidy <- gather(grants_raw, key="funding_mechanism",value="dollar_amount",4:12)
grants_tidy <- na.omit(grants_tidy)
head(grants_tidy)
##                                  institution principal_investigator
## 46866 UNIVERSITY OF PUERTO RICO MED SCIENCES    MARTINEZ, MELWEEN I
## 46875 UNIVERSITY OF PUERTO RICO MED SCIENCES       SARIOL, CARLOS A
## 57949     ASIAN HEALTH COALITION OF ILLINOIS     RANDAL, FORNESSA T
## 58350             BAYLOR COLLEGE OF MEDICINE       GIBBS, RICHARD A
## 59223                        BLACKFYNN, INC.      WAGENAAR, JOOST B
## 60860                  BROAD INSTITUTE, INC.   CLEMONS, PAUL ANDREW
##       date_granted funding_mechanism dollar_amount
## 46866      9/18/18      Construction       5789062
## 46875      9/18/18      Construction       2010938
## 57949      9/19/18             Other        444618
## 58350      9/25/18             Other       7944175
## 59223      9/22/18             Other       2395004
## 60860      3/27/18             Other        422447

Transformation for analysis

# Using groups to create two top ten tables: institution and principal investigator. Also calculate percent of total funding that was received on each funding mechanism
grants_analysis <- select(grants_tidy, institution, principal_investigator, funding_mechanism,dollar_amount)
top_ten_awards <- data.frame(head(grants_analysis[order(-grants_analysis$dollar_amount),],10))
grants_analysis <- group_by(grants_analysis, institution)
sum_institutions <- summarize(grants_analysis,award_dollars=sum(as.numeric(dollar_amount)))
top_ten_institutions <- data.frame(head(sum_institutions[order(-sum_institutions$award_dollars),],10))
grants_analysis <- group_by(grants_analysis, principal_investigator)
sum_pis <- summarize(grants_analysis,award_dollars=sum(dollar_amount))
top_ten_pis <- data.frame(head(sum_pis[order(-sum_pis$award_dollars),],11))
top_ten_pis <- top_ten_pis[-1,]
grants_analysis <- group_by(grants_analysis, funding_mechanism)
sum_mech <- summarize(grants_analysis,award_dollars=sum(as.numeric(dollar_amount)))
sum_mech <- mutate(sum_mech, total=sum(award_dollars))
sum_mech <- mutate(sum_mech, pct_total=round(award_dollars/total*100,2))
pct_funding_mech <- data.frame(select(sum_mech,funding_mechanism,pct_total))

Data visualization

top_ten_institutions
##                                institution award_dollars
## 1                 JOHNS HOPKINS UNIVERSITY     674583550
## 2  UNIVERSITY OF CALIFORNIA, SAN FRANCISCO     647880065
## 3      UNIVERSITY OF MICHIGAN AT ANN ARBOR     552433992
## 4   UNIVERSITY OF PITTSBURGH AT PITTSBURGH     536502831
## 5               UNIVERSITY OF PENNSYLVANIA     511419097
## 6                      STANFORD UNIVERSITY     505474358
## 7                    WASHINGTON UNIVERSITY     486295442
## 8                          DUKE UNIVERSITY     475338515
## 9           MASSACHUSETTS GENERAL HOSPITAL     465776958
## 10     COLUMBIA UNIVERSITY HEALTH SCIENCES     464799343
top_ten_pis
##        principal_investigator award_dollars
## 2              COHEN, MYRON S      84864416
## 3         KURITZKES, DANIEL R      60929808
## 4             COREY, LAWRENCE      56063854
## 5         TOPOL, ERIC JEFFREY      46828302
## 6  MCELRATH, MARGARET JULIANA      36796619
## 7             DRIVER, BARBARA      35915364
## 8              LACHIN, JOHN M      34541030
## 9      MITRA, GAUTAM (GEORGE)      33303879
## 10           HAYNES, BARTON F      30387129
## 11            NEPOM, GERALD T      29600000
ggplot(pct_funding_mech, aes(x=funding_mechanism,y = pct_total)) +
   geom_bar(width = .75,stat = "identity", position="dodge") +
   ggtitle("Percent of Total NIH Funding by Funding Mechanism (Fiscal Year 2018)") +
   labs(x="Funding Mechanism",y="Percent of Total") +
   theme(plot.title = element_text(hjust=0.5),axis.text.x = element_text(angle=45,hjust=1)) +
   scale_y_continuous(breaks = seq(0,100,by = 5))

Conclusion

Johns Hopkins University and UCSF were granted the greatest number of award dollars as institutions in fiscal year 2018. Myron S Cohen was the most successful individual in fiscal year 2018. The RPG Non-SBIR STTR funding mechanism is by far the most common funding mechanism for awards.

Emmys

The final dataset is a comprehensive list of the main Emmy awards: outstanding comedy, drama and variety show, and the lead actors and actresses in comedy and drama shows for each year. This dataset was posted by Saratchandra Palle, and requires transforming to long format, as well as some string extractions to create a “network” column as each show and actor is connected to a television network. The csv can be found here: https://github.com/hdupre/DATA607/blob/master/Project2/emmys.csv

Loading and tidying

emmys_raw <- read.csv("emmys.csv")
emmys_raw <- rename(emmys_raw, year=Year,Outstanding.Comedy=Comedy,Outstanding.Drama=Drama,Outstanding.Variety=Variety)

Certain irregularities in the data necessitate the removal of certain years, or certain awards from certain years. 1949,1950, and 1965 had different formats from the rest of the Emmys, such as one overall winner for all categories or numerous winners in each category.
In some years, no awards was given for a specific award type, and so the observation was removed.

emmys_raw <- emmys_raw[emmys_raw$year!=1949 & emmys_raw$year!=1950 & emmys_raw$year!=1965,]

emmys_tidy <- gather(emmys_raw,"award","winner",2:8)
## Warning: attributes are not identical across measure variables;
## they will be dropped
emmys_tidy <- emmys_tidy[-c(72,73,77,143),]

# Extracting network name from between parenthesis, then deleting parenthesis
network <- str_extract_all(emmys_tidy$winner,"\\(([^()]+)\\)")
network <- substring(network,2,nchar(network)-1)

# Certain irregularities with network names necessitated some manual adjustment
network[c(86,88,89,91,424,425,428)] <- "PBS"
network[c(271,407)] <- "no channel listed"
network[74] <- "CBS"
network[76] <- "NBC"

emmys_tidy$network <- network
head(emmys_tidy)
##   year              award                          winner network
## 1 1951 Outstanding.Comedy Pulitzer Prize Playhouse  (ABC)     ABC
## 2 1952 Outstanding.Comedy     The Red Skelton Show  (CBS)     CBS
## 3 1953 Outstanding.Comedy              I Love Lucy  (CBS)     CBS
## 4 1954 Outstanding.Comedy              I Love Lucy  (CBS)     CBS
## 5 1955 Outstanding.Comedy      Make Room for Daddy  (ABC)     ABC
## 6 1956 Outstanding.Comedy    The Phil Silvers Show  (CBS)     CBS

Transformation for analysis

emmys_analysis <- emmys_tidy
emmys_analysis <- group_by(emmys_analysis, network,award)
network_awards <- data.frame(summarize(emmys_analysis, count=n()))
network_awards <- network_awards[-c(51,52,59),]
emmys_analysis <- group_by(emmys_analysis, year,network)
awards_per_year <- summarize(emmys_analysis, per_network=n())
awards_per_year <- mutate(awards_per_year, total=sum(per_network))
awards_per_year <- mutate(awards_per_year, pct_yearly_total= round(per_network/total*100,2))
awards_per_year <- filter(awards_per_year, network=="ABC"|network=="CBS"|network=="NBC"|network=="Amazon")
awards_per_year <- data.frame(select(awards_per_year,year,network,pct_yearly_total))
head(awards_per_year)
##   year network pct_yearly_total
## 1 1951     ABC            28.57
## 2 1951     CBS            71.43
## 3 1952     CBS            28.57
## 4 1952     NBC            71.43
## 5 1953     CBS            28.57
## 6 1953     NBC            42.86

Data visualization

The analyses here will explore the total (major) awards per network that have been achieved since the inception of the Emmys, and the percent of all awards for a particular year won by the big three networks (ABC, NBC, CBS) versus the newcomer, Amazon.

ggplot(network_awards, aes(x=network,y = count,fill=award)) +
   geom_bar(width = .75,stat = "identity", position="dodge") +
   ggtitle("Major Emmy Awards Per Network, 1951-2019") +
   labs(x="Network",y="Number of Awards", fill= "Award") +
   theme(plot.title = element_text(hjust=0.5),axis.text.x = element_text(angle=90,hjust=1)) +
   scale_y_continuous(breaks = seq(0,35,by = 5))

ggplot(awards_per_year, aes(x=year,y = pct_yearly_total,color=network)) +
   geom_smooth(se=F) +
   ggtitle("Yearly Percent of Total Awards Per Network") +
   labs(x="Year",y="Percent of Yearly Total", fill= "Network") +
   theme(plot.title = element_text(hjust=0.5),axis.text.x = element_text(angle=90,hjust=1)) +
   scale_x_continuous(breaks = seq(1950,2020,by = 5))

Conclusion

CBS and NBC lead in overall major Emmy’s but they have a major lead as they were in the running from the beginning when there was much less competition. However, the yearly percent plot shows that their percentage of major awards is decreasing with time while upstart competitors like Amazon are gaining ground.