CAERS Adverse Foods Reactions Database

In this document we perform an Exploratory Data Analysis of the Adverse Food dataset available from Kaggle.com. This dataset is taken from the CFSAN Adverse Event Reporting System (CAERS) database collected by the US Food and Drug Administration. This dataset represents reports taken from 2004 until mid 2017.

Packages and Useful Functions

For this analysis we will only need the tidyverse and lubridate packages. We will also make use of the multiplot function available on R-Cookbooks.

suppressMessages(library(tidyverse))
suppressMessages(library(lubridate))

# Multiple plot function
#
# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
# - cols:   Number of columns in layout
# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
  library(grid)

  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)

  numPlots = length(plots)

  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    # Make the panel
    # ncol: Number of columns of plots
    # nrow: Number of rows needed, calculated from # of cols
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                    ncol = cols, nrow = ceiling(numPlots/cols))
  }

 if (numPlots==1) {
    print(plots[[1]])

  } else {
    # Set up the page
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

    # Make each plot, in the correct location
    for (i in 1:numPlots) {
      # Get the i,j matrix positions of the regions that contain this subplot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}

We start by loading the dataset.

df <- read_csv('datasets/CAERS_ASCII_2004_2017Q2.csv')

## Parsed with column specification:
## cols(
##   `RA_Report #` = col_integer(),
##   `RA_CAERS Created Date` = col_character(),
##   `AEC_Event Start Date` = col_character(),
##   `PRI_Product Role` = col_character(),
##   `PRI_Reported Brand/Product Name` = col_character(),
##   `PRI_FDA Industry Code` = col_integer(),
##   `PRI_FDA Industry Name` = col_character(),
##   `CI_Age at Adverse Event` = col_integer(),
##   `CI_Age Unit` = col_character(),
##   CI_Gender = col_character(),
##   `AEC_One Row Outcomes` = col_character(),
##   `SYM_One Row Coded Symptoms` = col_character()
## )

df

First, this dataset has some difficult to use feature names so we begin by changing the names to something more useable.

#Renaming features to meet our standards
df <- plyr::rename(df, c('RA_Report #'='ReportNum','RA_CAERS Created Date'='CreatedDate', 'AEC_Event Start Date'='StartDate', 'PRI_Product Role'='ProductRole', 'PRI_Reported Brand/Product Name'='Product', 'PRI_FDA Industry Code'='IndustryCode','PRI_FDA Industry Name'='IndustryName', 'CI_Age at Adverse Event'='Age', 'CI_Age Unit'='AgeUnit','CI_Gender'='Gender', 'AEC_One Row Outcomes'='Outcome', 'SYM_One Row Coded Symptoms'='Symptoms' ))
names(df)

##  [1] "ReportNum"    "CreatedDate"  "StartDate"    "ProductRole" 
##  [5] "Product"      "IndustryCode" "IndustryName" "Age"         
##  [9] "AgeUnit"      "Gender"       "Outcome"      "Symptoms"

Now we can get started with some simple data visualizations.

Ten Most Common Industries

Taking a look at the most common Industries to be associated with Adverse Events within this dataset.

BarPlotIndustry <- function(data = df , n = 10){
  data %>%
    group_by(IndustryName) %>%
    summarise(Count = n()) %>%
    arrange(desc(Count)) %>%
    ungroup() %>%
    mutate(IndustryName = reorder(IndustryName, Count))%>%
    head(n) %>%
    
    ggplot(aes(x = IndustryName, y = Count, fill = IndustryName)) +
    geom_bar(stat = 'identity',  color = 'white') +
    geom_text(aes( x = IndustryName, y = 1, label = paste0("(",Count,")", sep = '')), hjust = -.7, vjust = .5, size = 4, colour= 'black') + 
    coord_flip() +
    theme_classic() +
    labs(x = 'Industry', y = 'Count', title = paste('Top ',n,' Affected Industries', sep = '')) +
    scale_x_discrete(labels = function(x) lapply(strwrap(x, width = 10, simplify = FALSE), paste, collapse = '\n')) +
    theme(legend.position = 'none')
}
BarPlotIndustry()

Vitamins and Minerals make up the largest portion of Industries affected by a large margin followed by Cosmetics, then Nuts and Edible Seeds third.

Ten Most Common Products

Lets take a similar look at the most common products reported.

# Create a Product-Industry lookup table
prod_industry_map <- unique(df[,c(7,5)])

BarPlotProducts <- function(data = df, n = 10) {
  data %>%
    group_by(Product) %>%
    summarise(Count = n()) %>%
    arrange(desc(Count))%>%
    ungroup() %>%
    inner_join(prod_industry_map) %>%
    mutate(Product = reorder(Product, Count)) %>%
    filter(Product !='REDACTED') %>% #Largest group is redacted so we remove that class
    head(n) %>%
    
    ggplot(aes(x = Product, y = Count, fill = IndustryName)) +
    geom_bar(stat = 'identity', color = 'white') +
    coord_flip() +
    geom_text(aes(x = Product, y = 1, label = paste0("(",Count,")",sep="")),
            hjust=0, vjust=.5, size = 4, colour = 'black') +
    theme_classic() +
    labs(x = 'Product', y = 'Count', title = paste('Top ',n,' Products', sep = ''))+
    scale_x_discrete(labels = function(x) lapply(strwrap(x, width = 20, simplify = FALSE), paste, collapse = '\n')) +
    theme(axis.text.y  = element_text(size = 8)) 
}
BarPlotProducts()

## Joining, by = "Product"

Here we see that the top 10 is dominated by products in the Vitamin/ Minerals Industry.

Ten Most common Symptoms

Next we look at the most common Symptoms in this dataset followed by the most common outomes.

BarPlotSymptoms = function(data = df, n = 10)
{
Symptoms = str_split(data$Symptoms,',') 

AllSymptoms <- data.frame(matrix(unlist(Symptoms),byrow=T),stringsAsFactors=FALSE)

colnames(AllSymptoms) = c("SymptomName")

#trimws returns a character string with leading and/or trailing whitespaces removed.
AllSymptoms$SymptomName = trimws(AllSymptoms$SymptomName)

AllSymptoms %>%
  group_by(SymptomName) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  ungroup() %>%
  mutate(SymptomName = reorder(SymptomName,Count)) %>%
  head(n) %>%
  
  ggplot(aes(x = SymptomName,y = Count)) +
  geom_bar(stat='identity',colour="white", fill = 'salmon') +
  geom_text(aes(x = SymptomName, y = 1, label = paste0("(",Count,")",sep="")),
            hjust=0, vjust=.5, size = 4, colour = 'black') +
  labs(x = 'Symptom', 
       y = 'Count', 
       title = paste('Top ', n ,' Symptoms', sep = '')) +
  coord_flip() + 
  theme_classic() +
    scale_x_discrete(labels = function(x) lapply(strwrap(x, width = 20, simplify = FALSE), paste, collapse = '\n'))
}

BarPlotSymptoms()

Most Common Outcomes

BarPlotOutcomes <- function(data = df, n = 10) {
  Outcomes <- str_split(data$Outcome, ',')
  
  AllOutcomes <- data.frame(matrix(unlist(Outcomes),byrow=T),stringsAsFactors=FALSE)

colnames(AllOutcomes) = c("Outcome")

AllOutcomes$Outcome <- trimws(AllOutcomes$Outcome)
  
  AllOutcomes %>%
    group_by(Outcome) %>%
    summarise(Count = n()) %>%
    arrange(desc(Count)) %>%
    ungroup() %>%
    mutate(Outcome = reorder(Outcome, Count)) %>%
    head(n) %>%
    
    ggplot(aes(x = Outcome, y = Count)) +
    geom_bar(stat = 'identity', fill = 'salmon', color = 'white') +
    geom_text(aes( x = Outcome, y = 1, label = paste0("(",Count,")", sep = '')), hjust = 0, vjust = .5, size = 4, colour= 'black') + 
    coord_flip() +
    theme_classic() +
    labs(x = 'Outcome', y = 'Count', title = paste('Top ', n,' Outcomes', sep = '')) +
    scale_x_discrete(labels = function(x) lapply(strwrap(x, width = 20, simplify = FALSE), paste, collapse = '\n'))
}
BarPlotOutcomes(df)

Here we see the usual assortment of symptoms associated with ingestion. Nausea, Vomitting, and Diarrhoea are the three most common symptoms with most events resulting in visiting a health care provider, hospital or ER. The leading outcome is listed as Other. Life Threatening reactions, disability and even death are all outcomes that appear on the top 10 most common outcomes. We will explore the causes of these three outcomes later.

Age Distribution of Affected Persons

Now lets move on to look at the age distribution represented in this data. Currently the age of patients is listed in one of 5 ways. In decades, years, months, weeks, or days. So first we convert all observations into years and check for outlandish values.

df$Age <- as.double(df$Age)
df <- df %>%
  mutate(AgeatTimeofEvent = 
           if_else(df$AgeUnit == 'Decade(s)', Age*10 , 
                   if_else(df$AgeUnit == 'Month(s)', Age/12, 
                           if_else(df$AgeUnit == 'Week(s)', Age/52, 
                                   if_else(df$AgeUnit == 'Day(s)', Age/365, Age)))))
# Ensuring age modification worked properly
df[df$AgeUnit == 'Decade(s)', c(8,9,ncol(df))]

# First value appears to be 76 years but listed as decades
df[which(df$AgeatTimeofEvent == 760), ncol(df)] <- 76

# Check on Days
df[df$AgeUnit == 'Day(s)', c(8,9,ncol(df))] %>%
  arrange(desc(AgeatTimeofEvent))

# Check on Months
df[df$AgeUnit == 'Month(s)', c(8,9,ncol(df))] %>%
  arrange(desc(AgeatTimeofEvent))

# Check on Weeks
df[df$AgeUnit == 'Week(s)', c(8,9,ncol(df))] %>%
  arrange(desc(AgeatTimeofEvent))

# Check on Years
df[df$AgeUnit == 'Year(s)', c(8,9,ncol(df))] %>%
  arrange(desc(AgeatTimeofEvent))

# Several of these values are oddly high.  Lets remove values where the AgeatTimeofEvent is greater than 122 years which is the record for the oldest person on record
df[which(df$AgeatTimeofEvent > 122), ncol(df)] <-NA



summary(df$AgeatTimeofEvent)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00   34.00   53.00   50.37   68.00  115.00   37866

This leaves us with a high number of missing values but otherwise looks ok.

# Create new data frame with NA Ages removed
dfAge <- df[!is.na(df$Age) ,]

# Visualizing the ages of thosde affected
ggplot(dfAge, aes(x = AgeatTimeofEvent, colour = '1',fill = '1')) +
  geom_density() +
  labs(x = 'Age', y = 'Density', title = 'Age Density Plot') +
  theme_classic() +
  theme(legend.position = 'none') +
  scale_x_continuous(breaks = seq(0, 120, 10))

## Warning: Removed 6 rows containing non-finite values (stat_density).

It appears that those between the ages of 55 and 65 make up the largest proportion of this dataset. The dataset appears bimodal with a secondary peak under the age of 5.

This would follow with intuition that the young and the elderly are most likely to experince adverse effects of foods.

Next let’s look at the most common Industries, Products, Symptoms, and Outcomes broken down by age groups.

By age groups

Age groups will be those under 10, 10-18, 19-24, 25-50, 50-70, 70+

df$AgeGroup <- ifelse(df$AgeatTimeofEvent <= 10, 'Child',
                      ifelse(df$AgeatTimeofEvent > 10 & df$AgeatTimeofEvent <=18, 'Teen',
                             ifelse(df$AgeatTimeofEvent > 18 & df$AgeatTimeofEvent <= 24, 'Young Adult',
                                    ifelse(df$AgeatTimeofEvent > 24 & df$AgeatTimeofEvent <= 50, 'Adult',
                                           ifelse(df$AgeatTimeofEvent > 50 & df$AgeatTimeofEvent <= 70, 'Senior',
                                                  ifelse(df$AgeatTimeofEvent > 70, 'Elder', NA))))))

table(df$AgeGroup)

## 
##       Adult       Child       Elder      Senior        Teen Young Adult 
##       16593        4061       11295       17330        1655        1986

Children < 10 years

PlotChild <- function(data = df){
  PlotDat <- subset(df, AgeGroup == 'Child')
  p1 <- BarPlotIndustry(PlotDat, 4)

BarPlotProducts <- function(data = df, n = 10) {
  data %>%
    group_by(Product) %>%
    summarise(Count = n()) %>%
    arrange(desc(Count))%>%
    ungroup() %>%
    inner_join(prod_industry_map) %>%
    mutate(Product = reorder(Product, Count)) %>%
    filter(Product !='REDACTED') %>% #Largest group is redacted so we remove that class
    head(n) %>%
    
    ggplot(aes(x = Product, y = Count,  fill = 'salmon')) +
    geom_bar(stat = 'identity', color = 'white') +
    coord_flip() +
    geom_text(aes(x = Product, y = 1, label = paste0("(",Count,")",sep="")),
            hjust=0, vjust=.5, size = 4, colour = 'black') +
    theme_classic() +
    labs(x = 'Product', y = 'Count', title = paste('Top ',n,' Products', sep = ''))+
    scale_x_discrete(labels = function(x) lapply(strwrap(x, width = 20, simplify = FALSE), paste, collapse = '\n')) +
    theme(axis.text.y  = element_text(size = 8), legend.position = 'none') 
}
  p2 <- BarPlotProducts(PlotDat, 4)
  
  p3 <- BarPlotSymptoms(PlotDat, 4)
  
  p4 <- BarPlotOutcomes(PlotDat, 4)
  
  multiplot(p1, p2, p3, p4, cols = 2)
  
}

PlotChild(df)

## Joining, by = "Product"

In young children, most issues were found within the Baby Food Industry though the most common product in this age group is Peter Pan Peanut Butter. Vomiting and diarrhoea are the most common symptoms resulting in mostly non-serious injuries/illness.

Teens 11-18

PlotTeen <- function(data = df){
  PlotDat <- subset(df, AgeGroup == 'Teen')
  dim(PlotDat)
  p1 <- BarPlotIndustry(data = PlotDat, n = 4)
  
BarPlotProducts <- function(data = df, n = 10) {
  data %>%
    group_by(Product) %>%
    summarise(Count = n()) %>%
    arrange(desc(Count))%>%
    ungroup() %>%
    inner_join(prod_industry_map) %>%
    mutate(Product = reorder(Product, Count)) %>%
    filter(Product !='REDACTED') %>% #Largest group is redacted so we remove that class
    head(n) %>%
    
    ggplot(aes(x = Product, y = Count,  fill = 'salmon')) +
    geom_bar(stat = 'identity', color = 'white') +
    coord_flip() +
    geom_text(aes(x = Product, y = 1, label = paste0("(",Count,")",sep="")),
            hjust=0, vjust=.5, size = 4, colour = 'black') +
    theme_classic() +
    labs(x = 'Product', y = 'Count', title = paste('Top ',n,' Products', sep = ''))+
    scale_x_discrete(labels = function(x) lapply(strwrap(x, width = 20, simplify = FALSE), paste, collapse = '\n')) +
    theme(axis.text.y  = element_text(size = 8), legend.position = 'none') 
}
  p2 <- BarPlotProducts(PlotDat, 4)
  
  p3 <- BarPlotSymptoms(PlotDat, 4)
  
  p4 <- BarPlotOutcomes(PlotDat, 4)
  
  multiplot(p1, p2, p3, p4, cols = 2)
  
}

PlotTeen(df)

## Joining, by = "Product"

In the “Teen” age group, the Vitamin/Mineral industry results in by far the largest industry affected. Peter Pan peanut butter maintains the top two spots in the most common products. Vomiting and Non-Serious injury/illness are still the most common symptom and outcome respectively.

Young Adults 19-25

PlotTeen <- function(data = df){
  PlotDat <- subset(df, AgeGroup == 'Young Adult')
  dim(PlotDat)
  p1 <- BarPlotIndustry(data = PlotDat, n = 4)
  
BarPlotProducts <- function(data = df, n = 10) {
  data %>%
    group_by(Product) %>%
    summarise(Count = n()) %>%
    arrange(desc(Count))%>%
    ungroup() %>%
    inner_join(prod_industry_map) %>%
    mutate(Product = reorder(Product, Count)) %>%
    filter(Product !='REDACTED') %>% #Largest group is redacted so we remove that class
    head(n) %>%
    
    ggplot(aes(x = Product, y = Count,  fill = 'salmon')) +
    geom_bar(stat = 'identity', color = 'white') +
    coord_flip() +
    geom_text(aes(x = Product, y = 1, label = paste0("(",Count,")",sep="")),
            hjust=0, vjust=.5, size = 4, colour = 'black') +
    theme_classic() +
    labs(x = 'Product', y = 'Count', title = paste('Top ',n,' Products', sep = ''))+
    scale_x_discrete(labels = function(x) lapply(strwrap(x, width = 20, simplify = FALSE), paste, collapse = '\n')) +
    theme(axis.text.y  = element_text(size = 8), legend.position = 'none') 
}
  p2 <- BarPlotProducts(PlotDat, 4)
  
  p3 <- BarPlotSymptoms(PlotDat, 4)
  
  p4 <- BarPlotOutcomes(PlotDat, 4)
  
  multiplot(p1, p2, p3, p4, cols = 2)
  
}

PlotTeen(df)

## Joining, by = "Product"

In Young Adults, we see variations of Hydroxycut as the top 3 products. Affected Industries and Symptoms are largely the same as the “Teen” age group though the most common outcomes are more serious with Hospitilization and Other Serious medical events being the two most common outcomes.

Adults 26-50

PlotTeen <- function(data = df){
  PlotDat <- subset(df, AgeGroup == 'Adult')
  dim(PlotDat)
  p1 <- BarPlotIndustry(data = PlotDat, n = 4)
  
BarPlotProducts <- function(data = df, n = 10) {
  data %>%
    group_by(Product) %>%
    summarise(Count = n()) %>%
    arrange(desc(Count))%>%
    ungroup() %>%
    inner_join(prod_industry_map) %>%
    mutate(Product = reorder(Product, Count)) %>%
    filter(Product !='REDACTED') %>% #Largest group is redacted so we remove that class
    head(n) %>%
    
    ggplot(aes(x = Product, y = Count,  fill = 'salmon')) +
    geom_bar(stat = 'identity', color = 'white') +
    coord_flip() +
    geom_text(aes(x = Product, y = 1, label = paste0("(",Count,")",sep="")),
            hjust=0, vjust=.5, size = 4, colour = 'black') +
    theme_classic() +
    labs(x = 'Product', y = 'Count', title = paste('Top ',n,' Products', sep = ''))+
    scale_x_discrete(labels = function(x) lapply(strwrap(x, width = 20, simplify = FALSE), paste, collapse = '\n')) +
    theme(axis.text.y  = element_text(size = 8), legend.position = 'none') 
}
  p2 <- BarPlotProducts(PlotDat, 4)
  
  p3 <- BarPlotSymptoms(PlotDat, 4)
  
  p4 <- BarPlotOutcomes(PlotDat, 4)
  
  multiplot(p1, p2, p3, p4, cols = 2)
  
}

PlotTeen(df)

## Joining, by = "Product"

This age group is very similar to the Young Adult age group with the exception of the top 4 products. Hydroxycut Regular Rapid Release Caplets remains though the top four is now rounded out with Multivitamins, Vitamin D, and Wen Cleansing Conditioner.

Seniors 51-70

PlotTeen <- function(data = df){
  PlotDat <- subset(df, AgeGroup == 'Senior')
  dim(PlotDat)
  p1 <- BarPlotIndustry(data = PlotDat, n = 4)
  
BarPlotProducts <- function(data = df, n = 10) {
  data %>%
    group_by(Product) %>%
    summarise(Count = n()) %>%
    arrange(desc(Count))%>%
    ungroup() %>%
    inner_join(prod_industry_map) %>%
    mutate(Product = reorder(Product, Count)) %>%
    filter(Product !='REDACTED') %>% #Largest group is redacted so we remove that class
    head(n) %>%
    
    ggplot(aes(x = Product, y = Count,  fill = 'salmon')) +
    geom_bar(stat = 'identity', color = 'white') +
    coord_flip() +
    geom_text(aes(x = Product, y = 1, label = paste0("(",Count,")",sep="")),
            hjust=0, vjust=.5, size = 4, colour = 'black') +
    theme_classic() +
    labs(x = 'Product', y = 'Count', title = paste('Top ',n,' Products', sep = ''))+
    scale_x_discrete(labels = function(x) lapply(strwrap(x, width = 20, simplify = FALSE), paste, collapse = '\n')) +
    theme(axis.text.y  = element_text(size = 8), legend.position = 'none') 
}
  p2 <- BarPlotProducts(PlotDat, 4)
  
  p3 <- BarPlotSymptoms(PlotDat, 4)
  
  p4 <- BarPlotOutcomes(PlotDat, 4)
  
  multiplot(p1, p2, p3, p4, cols = 2)
  
}

PlotTeen(df)

## Joining, by = "Product"

This age group is largely similar to the Adults gorup with the addition of Alopecia listed as one of the more common Symptoms.

Elderly 70+

PlotTeen <- function(data = df){
  PlotDat <- subset(df, AgeGroup == 'Elder')
  dim(PlotDat)
  p1 <- BarPlotIndustry(data = PlotDat, n = 4)
  
BarPlotProducts <- function(data = df, n = 10) {
  data %>%
    group_by(Product) %>%
    summarise(Count = n()) %>%
    arrange(desc(Count))%>%
    ungroup() %>%
    inner_join(prod_industry_map) %>%
    mutate(Product = reorder(Product, Count)) %>%
    filter(Product !='REDACTED') %>% #Largest group is redacted so we remove that class
    head(n) %>%
    
    ggplot(aes(x = Product, y = Count,  fill = 'salmon')) +
    geom_bar(stat = 'identity', color = 'white') +
    coord_flip() +
    geom_text(aes(x = Product, y = 1, label = paste0("(",Count,")",sep="")),
            hjust=0, vjust=.5, size = 4, colour = 'black') +
    theme_classic() +
    labs(x = 'Product', y = 'Count', title = paste('Top ',n,' Products', sep = ''))+
    scale_x_discrete(labels = function(x) lapply(strwrap(x, width = 20, simplify = FALSE), paste, collapse = '\n')) +
    theme(axis.text.y  = element_text(size = 8), legend.position = 'none') 
}
  p2 <- BarPlotProducts(PlotDat, 4)
  
  p3 <- BarPlotSymptoms(PlotDat, 4)
  
  p4 <- BarPlotOutcomes(PlotDat, 4)
  
  multiplot(p1, p2, p3, p4, cols = 2)
  
}

PlotTeen(df)

## Joining, by = "Product"

Centrum Silver becomes a leading Product in the Elder age group. Elders also see Dysphagia (difficulty swallowing) and Dyspnoea (difficulty breathing) as two of their top 4 symptoms.

Dangerous Products

Now let’s take a look at what products cause some of the most serious outcomes. Here we limit these outcomes to: * Death * Disability * Life Threatening

First let’s look at what age

PlotSeriousOutcomeAges <- function(data = df) {
  PlotDat <- filter(df, str_detect(tolower(df$Outcome), 'death' ) | str_detect(tolower(df$Outcome), 'disability') | str_detect(tolower(df$Outcome), 'life threatening'))
  ggplot(PlotDat, aes(x = Outcome, y = Age)) +
    geom_point(position = 'jitter', alpha = .5) +
    xlim('DEATH', 'DISABILITY','LIFE THREATENING') +
    coord_cartesian(ylim = c(0,120)) +
    labs(x = 'Outcomes', title = 'Age Distribution by Serious Outcomes') + 
    theme_classic()
}
PlotSeriousOutcomeAges(df)

## Warning: Removed 7380 rows containing missing values (geom_point).

Here we see that death and life threatening are outcomes that are fairly uniformly distributed across all ages. Disability however seems to be more concentrated between 50 and 75.

Death

PlotDeathProducts <- function(data = df, n = 10) {
  PlotDat <- filter(df ,  str_detect(tolower(df$Outcome), 'death'))
  dim(PlotDat)
  BarPlotProducts(PlotDat)
} 
PlotDeathProducts(df)

## Joining, by = "Product"

It looks like Raw Oysters unfortunatly result in the most Deaths with oysters not designated as raw also making it into the top ten. It should be noted that if combined ‘5-HOUR ENERGY’ and ‘5 HOUR ENERGY’ result in the second most deaths.

Disability

PlotDeathProducts <- function(data = df, n = 10) {
  PlotDat <- filter(df ,  str_detect(tolower(df$Outcome), 'disability'))
  dim(PlotDat)
  BarPlotProducts(PlotDat)
} 
PlotDeathProducts(df)

## Joining, by = "Product"

The causes of disabilities are dominated by various vitamins and supplements with Gluten Free Cheerios, Wen Cleansing Conditioner and Chobani Yogurt also making an appearance in the top 10 causes of disabilities.

Life Threatening

PlotDeathProducts <- function(data = df, n = 10) {
  PlotDat <- filter(df ,  str_detect(tolower(df$Outcome), 'threatening'))
  dim(PlotDat)
  BarPlotProducts(PlotDat)
} 
PlotDeathProducts(df)

## Joining, by = "Product"

Causes of Life Threatening cases seems to be a mixture of the results from death and disabilities. Raw Oysters once again lead the way with the rest of the top 10 filled with vitamins and supplements. This time Hydroxycut makes the list at number 7 with 38 occurences.

Events over Time

Next lets take a look at how adverse events are curated over time.

df$Year <- year(dmy(df$StartDate))

## Warning: 29962 failed to parse.

PlotTimeLine <- function(data = df) {
  data %>%
    filter(Year >= 2000) %>%
    group_by(Year) %>%
    summarise(Count = n()) %>%
    arrange(Year) %>%
    mutate(Year = reorder(Year, Count)) %>%
    ggplot(aes(x = Year, y = Count, group = 1)) +
    geom_line(stat= 'identity') +
    geom_point() +
    theme_bw() +
    labs(x = 'Year', y = 'Count', title = 'Occurences Over Time')
}
PlotTimeLine(df)

Since the year 2000, reported events has been on the increase. This is probably due to better record keeping/reporting and hightened awareness of medical practices.

Death on the weekends

There have been many reports about the so-called “Weekend Effect” in hospitals. This states that people are roughly 15% more likely to die in a hospital on the weekend. Lets check to see if this holds true in our dataset

  df$weekday <- wday(dmy(df$StartDate), label = TRUE)

## Warning: 29962 failed to parse.

PlotWeekendDeath <- function(data = df) {

  data %>%
    filter(!is.na(weekday)) %>%
    group_by(weekday) %>%
    summarise(Count = n()) %>%
    
    ggplot(aes(x = as.factor(weekday), y = Count)) +
    geom_bar(stat = 'identity', fill = 'salmon') +
    theme_classic() +
    labs(x = 'Day of the Week', title = 'Deaths by the Day of the Week')
}
PlotWeekendDeath()

Looks as though the “Weekend Effect” is only a myth! Or at least it is not represented in this dataset.

The Effect of Gender

Lastly let’s take a quick look at Gender and how it may impact our results

Overall Reporting

PlotGender <- function(data = df) {
  ggplot(data , aes(Gender)) + 
  geom_bar(color = 'white', fill = 'salmon') +
  labs(x = 'Gender', y = 'Count', title = 'Reported Genders') +
  theme_classic()
}
PlotGender()

In this dataset, the ratio of Female to Male is over 2:1. Let’s break down some of our previous reports by gender.

Industries

BarPlotIndustryGender <- function(data = df , n = 10){
  p1 <-data %>%
    filter(Gender == 'Male') %>%
    group_by(IndustryName) %>%
    summarise(Count = n()) %>%
    arrange(desc(Count)) %>%
    ungroup() %>%
    mutate(IndustryName = reorder(IndustryName, Count))%>%
    head(n) %>%
    
    ggplot(aes(x = IndustryName, y = Count)) +
    geom_bar(stat = 'identity', fill = 'salmon', color = 'white') +
    geom_text(aes( x = IndustryName, y = 1, label = paste0("(",Count,")", sep = '')), hjust = -.7, vjust = .5, size = 4, colour= 'black') + 
    coord_flip() +
    theme_classic() +
    labs(x = 'Industry', y = 'Count', title = paste('Top ',n,' Male Industries', sep = '')) +
    scale_x_discrete(labels = function(x) lapply(strwrap(x, width = 10, simplify = FALSE), paste, collapse = '\n'))
  
  p2 <-data %>%
    filter(Gender == 'Female') %>%
    group_by(IndustryName) %>%
    summarise(Count = n()) %>%
    arrange(desc(Count)) %>%
    ungroup() %>%
    mutate(IndustryName = reorder(IndustryName, Count))%>%
    head(n) %>%
    
    ggplot(aes(x = IndustryName, y = Count)) +
    geom_bar(stat = 'identity', fill = 'salmon', color = 'white') +
    geom_text(aes( x = IndustryName, y = 1, label = paste0("(",Count,")", sep = '')), hjust = -.7, vjust = .5, size = 4, colour= 'black') + 
    coord_flip() +
    theme_classic() +
    labs(x = 'Industry', y = 'Count', title = paste('Top ',n,' Female Industries', sep = '')) +
    scale_x_discrete(labels = function(x) lapply(strwrap(x, width = 10, simplify = FALSE), paste, collapse = '\n'))
  
  multiplot(p1, p2, cols = 2)
}
BarPlotIndustryGender(n = 4)

As expected, women are much more likely to have issues with Cosmetics than men are. It appears that men are more likely to have issues with products from he Fishery/Seafood Prod industry.

Products

BarPlotProductsGender <- function(data = df, n = 10) {
  p1 <-data %>%
    filter(Gender == 'Male') %>%
    group_by(Product) %>%
    summarise(Count = n()) %>%
    arrange(desc(Count))%>%
    ungroup() %>%
    mutate(Product = reorder(Product, Count)) %>%
    filter(Product !='REDACTED') %>% #Largest group is redacted so we remove that class
    head(n) %>%
    
    ggplot(aes(x = Product, y = Count)) +
    geom_bar(stat = 'identity', color = 'white', fill = 'salmon') +
    coord_flip() +
    geom_text(aes(x = Product, y = 1, label = paste0("(",Count,")",sep="")),
            hjust=0, vjust=.5, size = 4, colour = 'black') +
    theme_classic() +
    labs(x = 'Product', y = 'Count', title = paste('Top ',n,' Male Products', sep = ''))+
    scale_x_discrete(labels = function(x) lapply(strwrap(x, width = 20, simplify = FALSE), paste, collapse = '\n')) +
    theme(axis.text.y  = element_text(size = 8))
  
  p2 <-data %>%
    filter(Gender == 'Female') %>%
    group_by(Product) %>%
    summarise(Count = n()) %>%
    arrange(desc(Count))%>%
    ungroup() %>%
    mutate(Product = reorder(Product, Count)) %>%
    filter(Product !='REDACTED') %>% #Largest group is redacted so we remove that class
    head(n) %>%
    
    ggplot(aes(x = Product, y = Count)) +
    geom_bar(stat = 'identity', color = 'white', fill = 'salmon') +
    coord_flip() +
    geom_text(aes(x = Product, y = 1, label = paste0("(",Count,")",sep="")),
            hjust=0, vjust=.5, size = 4, colour = 'black') +
    theme_classic() +
    labs(x = 'Product', y = 'Count', title = paste('Top ',n,' Female Products', sep = ''))+
    scale_x_discrete(labels = function(x) lapply(strwrap(x, width = 20, simplify = FALSE), paste, collapse = '\n')) +
    theme(axis.text.y  = element_text(size = 8))
  
  multiplot(p1, p2, cols = 2)
}
BarPlotProductsGender(n = 4)

All four top produts for women are vitamins. Men have issues with Super Beta Prostate vitamins most often. Of course this is a supplement we would only expect men to take. Men also have issues with raw oysters and Hydroxycut Rapid-Release Caplets(a weight loss supplement).

The Elderly and Death

At this point, we have explored data in many ways, and one prominent questions comes to mind: Are the elderly more likely to experiece serious symptoms than non-elders? Let’s start by taking a look at the difference between the two groups.

df$elderly <- factor(ifelse(df$AgeatTimeofEvent == 'Elder' ,'Elderly' , 'Adult'), levels = c('Elderly', 'Adult'))

seriousconditions <- c("OTHER SERIOUS (IMPORTANT MEDICAL EVENTS)", 'SERIOUS INJURIES/ ILLNESS' , 'LIFE THREATENING', 'DISABILITY', 'DEATH')

df$Serious <- factor(ifelse(df$Outcome %in% seriousconditions, 'Serious', 'Non-Serious'),levels = c('Serious', 'Non-Serious'))

To keep this test consistent, we limit our data to only those who are over 18.

elderlydata <- df %>% filter(Age > 18)

elderlydata %>% group_by(elderly,Serious) %>% summarise(N = n()) %>% summarise(Prob_serious = N[1]/sum(N)) %>% summarise(diff_prob = diff(Prob_serious))

In this data, those 70 and above are observed to have a 35 percentagepoint higher chance of being inflicted with a serious outcome. This seems quite high but to confirm we need to do some proper hypothesis testing to determine if this is statistically significant or simply due to variation within the data.

Hypothesis Testing

The Null hypothesis is that there is no difference in the likelihood of serious outcomes in Adults and in Elders. To determine if our observed difference of 35 percentage points is statistically significant, we need to calculate the null-distribution of the data.

THe null-distribution shows how the natural variance of the distribution is under the assumption that there is no link between being an elder and a serious outcome. To do this we break any connection between being an elder and serious outcomes by randomizing the Serious feature and re-calculating how far apart the Elder and Adult groups are. Since the data is randomized, what we get is a measurement of the natural variance in the data. We will do this repeatedly 1000 times. The resulting null-distribution will allow us to determine a p-value for our observed 35%.

For ease of use we will using the rep_sample_n function from the oilabs package. Since tis package is not yet available on CRAN, we will build the function manually. More information on the oilabs package can be found here: https://www.github.com/OpenIntroOrg/oilabs-r-package

rep_sample_n <- function(tbl, size, replace = FALSE, reps = 1)
{
    n <- nrow(tbl)
    i <- unlist(replicate(reps, sample.int(n, size, replace = replace), simplify = FALSE))

    rep_tbl <- cbind(replicate = rep(1:reps,rep(size,reps)), tbl[i,])

    dplyr::group_by(rep_tbl, replicate)
}

set.seed(42)
eld_perm <- elderlydata %>%
  rep_sample_n( size = nrow(elderlydata) , reps = 1000) %>%
  mutate(ser_perm = sample(Serious)) %>%
  group_by(replicate, elderly) %>%
  summarize(prop_ser_perm = mean(ser_perm == 'Serious'),
            prop_ser = mean(Serious == 'Serious')) %>%
  summarize(diff_perm = diff(prop_ser_perm), 
           diff_orig = diff(prop_ser))

ggplot(eld_perm, aes(x = diff_perm)) +
  geom_density(bw = .1, trim = TRUE) +
  geom_vline(aes(xintercept = diff_orig) , col = 'red') +
  labs(title = 'H-Null Distribution vs Observed Value')

This is a very interesting normal distribution. Our observation (represented by the red line) does fall within some part of the null distribution but to determine statistical significance, we calculate the p-value.

z = (mean(eld_perm$diff_orig) - mean(eld_perm$diff_perm))/sd(eld_perm$diff_perm)
pv <- pnorm(-abs(z))
pv

## [1] 0.03808928

This gives us a p-value of 0.0381 which is smaller than the generally accepted limit of .05 which means that those who are 70+ years old are statistically more likely to incur a serious outcome than other adults within this data. It should be noted however that this distribution does not pass the two-tailed t-test with a p-value of 0.0762.

CAERS Adverse Food Effects EDA