install packages
library(RCurl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ stringr 1.4.1
## ✔ tidyr 1.2.0 ✔ forcats 0.5.2
## ✔ readr 2.1.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::complete() masks RCurl::complete()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(readr)
library(ggplot2)
library(broom)
library(ggpubr)
import data
tsa <- read.csv("https://raw.githubusercontent.com/arinolan/607-final/main/tsa%20data.csv")
recall <- read.csv("https://raw.githubusercontent.com/arinolan/607-final/main/recalls.csv")
filter recall data to 2022 and take subset of columns
newRecall <- subset(recall, select = -c(url, establishment_id, establishment_slug, establishment_address, establishment_telephone),
start_date > '2021-12-31')
update date formate
tsa$Date <- trimws(gsub('/', '-', tsa$Date, fixed=T))
tsa$Date <- as.Date((tsa$Date), format = "%d")
combine tsa and recall data into one table; rename X2022 column to TSA
fullData <- merge(tsa, newRecall)
names(fullData)[names(fullData) == 'X2022'] <- 'TSA'
fullData$TSA <- trimws(gsub(',', '', fullData$TSA, fixed=T))
convert column types
fullData$TSA <- as.numeric(fullData$TSA)
summary of combined data
summary(fullData)
## Date TSA start_date end_date
## Min. :2022-12-01 Min. :1059741 Length:21112 Length:21112
## 1st Qu.:2022-12-04 1st Qu.:1894331 Class :character Class :character
## Median :2022-12-07 Median :2128375 Mode :character Mode :character
## Mean :2022-12-06 Mean :2064043
## 3rd Qu.:2022-12-09 3rd Qu.:2308212
## Max. :2022-12-12 Max. :2494757
## NA's :2436
## id title reasons status
## Length:21112 Length:21112 Length:21112 Length:21112
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## risk_level establishment_name establishment_grant_date
## Length:21112 Length:21112 Length:21112
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## establishment_activities quantity_recovered quantity_unit
## Length:21112 Min. : 0 Length:21112
## Class :character 1st Qu.: 457 Class :character
## Mode :character Median : 2070 Mode :character
## Mean : 8845
## 3rd Qu.: 5864
## Max. :53909
## NA's :14560
## states
## Length:21112
## Class :character
## Mode :character
##
##
##
##
summary(fullData$TSA)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1059741 1894331 2128375 2064043 2308212 2494757 2436
summary(fullData$quantity_recovered)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0 457 2070 8845 5864 53909 14560
cor(fullData$TSA, fullData$quantity_recovered)
## [1] NA
check if TSA has normal distribution and create plot
hist(fullData$TSA)
plot(TSA ~ quantity_recovered, data=fullData)
linear regression between tsa #s and quantity recovered (calling it relationship)
relationship.lm <- lm(TSA ~ quantity_recovered, data=fullData)
summary(relationship.lm)
##
## Call:
## lm(formula = TSA ~ quantity_recovered, data = fullData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1004302 -169712 64332 244169 430714
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.064e+06 4.600e+03 448.7 <2e-16 ***
## quantity_recovered 5.218e-15 2.459e-01 0.0 1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 308600 on 5794 degrees of freedom
## (15316 observations deleted due to missingness)
## Multiple R-squared: 1.628e-30, Adjusted R-squared: -0.0001726
## F-statistic: 9.431e-27 on 1 and 5794 DF, p-value: 1
par(mfrow=c(2,2))
plot(relationship.lm)
par(mfrow=c(1,1))
horrible R2 so no correlation with TSA and quantity recovered
multiple regression to include state as factor (calling it multiple)
in conclusion, there is no correlation between TSA numbers and the amount of quantities recovered from food recalls. I think what would’ve worked better is if i merged the dataframes by the date but when i attempted this originally, the output was an empty table. If I were able to merge by the date, i would have a better and more accurate idea as to my research question. I wanted to attempt screenscraping but I didn’t have any luck - this is an area for improvement for me. based one the charts above, these would support my conclusion that there isn’t a correlation between the two different datasets. I used a linear regression model as well as created plots to visualize the standard residuals for the lienar regression model (didn’t cover this in class). when looking at the standard residual graphs, the goal is to have the red line straight a 0 and a straight line for the theoretical quantities. Even though this analysis doesn’t really benefit me int he real world, it was fun to compare unlikely correlated datasets while using new techniques that i learned throughout this semester.