install packages

library(RCurl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ stringr 1.4.1
## ✔ tidyr   1.2.0     ✔ forcats 0.5.2
## ✔ readr   2.1.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::complete() masks RCurl::complete()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ dplyr::lag()      masks stats::lag()
library(readr)
library(ggplot2)
library(broom)
library(ggpubr)

import data

tsa <- read.csv("https://raw.githubusercontent.com/arinolan/607-final/main/tsa%20data.csv")

recall <- read.csv("https://raw.githubusercontent.com/arinolan/607-final/main/recalls.csv")

filter recall data to 2022 and take subset of columns

newRecall <- subset(recall, select = -c(url, establishment_id, establishment_slug, establishment_address, establishment_telephone), 
                    start_date > '2021-12-31')

update date formate

tsa$Date <- trimws(gsub('/', '-', tsa$Date, fixed=T))

tsa$Date <- as.Date((tsa$Date), format = "%d")

combine tsa and recall data into one table; rename X2022 column to TSA

fullData <- merge(tsa, newRecall)

names(fullData)[names(fullData) == 'X2022'] <- 'TSA'

fullData$TSA <- trimws(gsub(',', '', fullData$TSA, fixed=T))

convert column types

fullData$TSA <- as.numeric(fullData$TSA)

summary of combined data

summary(fullData)
##       Date                 TSA           start_date          end_date        
##  Min.   :2022-12-01   Min.   :1059741   Length:21112       Length:21112      
##  1st Qu.:2022-12-04   1st Qu.:1894331   Class :character   Class :character  
##  Median :2022-12-07   Median :2128375   Mode  :character   Mode  :character  
##  Mean   :2022-12-06   Mean   :2064043                                        
##  3rd Qu.:2022-12-09   3rd Qu.:2308212                                        
##  Max.   :2022-12-12   Max.   :2494757                                        
##                       NA's   :2436                                           
##       id               title             reasons             status         
##  Length:21112       Length:21112       Length:21112       Length:21112      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   risk_level        establishment_name establishment_grant_date
##  Length:21112       Length:21112       Length:21112            
##  Class :character   Class :character   Class :character        
##  Mode  :character   Mode  :character   Mode  :character        
##                                                                
##                                                                
##                                                                
##                                                                
##  establishment_activities quantity_recovered quantity_unit     
##  Length:21112             Min.   :    0      Length:21112      
##  Class :character         1st Qu.:  457      Class :character  
##  Mode  :character         Median : 2070      Mode  :character  
##                           Mean   : 8845                        
##                           3rd Qu.: 5864                        
##                           Max.   :53909                        
##                           NA's   :14560                        
##     states         
##  Length:21112      
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 
summary(fullData$TSA)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## 1059741 1894331 2128375 2064043 2308212 2494757    2436
summary(fullData$quantity_recovered)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##       0     457    2070    8845    5864   53909   14560
cor(fullData$TSA, fullData$quantity_recovered)
## [1] NA

check if TSA has normal distribution and create plot

hist(fullData$TSA)

plot(TSA ~  quantity_recovered, data=fullData)

linear regression between tsa #s and quantity recovered (calling it relationship)

relationship.lm <- lm(TSA ~ quantity_recovered, data=fullData)

summary(relationship.lm)
## 
## Call:
## lm(formula = TSA ~ quantity_recovered, data = fullData)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1004302  -169712    64332   244169   430714 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        2.064e+06  4.600e+03   448.7   <2e-16 ***
## quantity_recovered 5.218e-15  2.459e-01     0.0        1    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 308600 on 5794 degrees of freedom
##   (15316 observations deleted due to missingness)
## Multiple R-squared:  1.628e-30,  Adjusted R-squared:  -0.0001726 
## F-statistic: 9.431e-27 on 1 and 5794 DF,  p-value: 1
par(mfrow=c(2,2))
plot(relationship.lm)

par(mfrow=c(1,1))

horrible R2 so no correlation with TSA and quantity recovered

multiple regression to include state as factor (calling it multiple)

in conclusion, there is no correlation between TSA numbers and the amount of quantities recovered from food recalls. I think what would’ve worked better is if i merged the dataframes by the date but when i attempted this originally, the output was an empty table. If I were able to merge by the date, i would have a better and more accurate idea as to my research question. I wanted to attempt screenscraping but I didn’t have any luck - this is an area for improvement for me. based one the charts above, these would support my conclusion that there isn’t a correlation between the two different datasets. I used a linear regression model as well as created plots to visualize the standard residuals for the lienar regression model (didn’t cover this in class). when looking at the standard residual graphs, the goal is to have the red line straight a 0 and a straight line for the theoretical quantities. Even though this analysis doesn’t really benefit me int he real world, it was fun to compare unlikely correlated datasets while using new techniques that i learned throughout this semester.