library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(dplyr)

# Location of the cleaned diabetes workbook.
# NOTE(review): this is a machine-specific absolute path; consider a
# project-relative path so the script runs on other computers.
data_path <- file.path(
  "C:/Users/miche/OneDrive/Desktop/My Class Stuff/Wednesday Class",
  "Data Diabetes",
  "Diabetes_Data_Clean.xlsx"
)

# Fail early with a clear message if the workbook is not where we expect it.
if (!file.exists(data_path)) {
  stop("Data file not found: ", data_path, call. = FALSE)
}

# Read the workbook once; every later step starts from `raw`.
raw <- read_excel(data_path)
# Build the analysis table: keep only the two study variables, coerce both to
# numeric, and discard any row in which either value is missing.
dat <- raw %>%
  dplyr::select(Diagnosed, SNAP) %>%
  mutate(across(c(Diagnosed, SNAP), as.numeric)) %>%
  filter(!(is.na(Diagnosed) | is.na(SNAP)))

# Quick structural check of the cleaned table.
str(dat)
## tibble [371 × 2] (S3: tbl_df/tbl/data.frame)
## $ Diagnosed: num [1:371] 19.4 13 12.7 11.4 13.2 19.9 11.7 9.8 14.2 10.3 ...
## $ SNAP : num [1:371] 28.3 6.7 20.7 7.9 5.9 27 13.6 8.5 21.2 19.5 ...
names(dat)
## [1] "Diagnosed" "SNAP"
# Show the source code of every chunk in the knitted report.
knitr::opts_chunk$set(echo = TRUE)

# `dat` already holds Diagnosed and SNAP as numeric columns with incomplete
# rows removed, so reuse it rather than repeating the same cleaning pipeline.
df <- dat

# Five-number summary (plus mean) of both variables, then the row count.
summary(df)
## Diagnosed SNAP
## Min. : 2.50 Min. : 2.00
## 1st Qu.:12.70 1st Qu.: 9.90
## Median :15.30 Median :16.70
## Mean :16.46 Mean :19.71
## 3rd Qu.:20.15 3rd Qu.:27.45
## Max. :29.50 Max. :64.70
nrow(df)
## [1] 371
# Pearson (linear) correlation between the two variables.
cor_pearson <- cor(df$Diagnosed, df$SNAP, method = "pearson")
cor_pearson
## [1] 0.8395643

# Scatterplot matrix of the two variables for a visual check of the relation.
pairs(~ Diagnosed + SNAP, data = df, main = "Pairs plot: Diagnosed & SNAP")

# Kendall's rank correlation with a two-sided significance test.
# exact = FALSE asks for the approximate p-value (the z statistic shown below)
# instead of the exact one.
kendall_test <- cor.test(
  df$Diagnosed, df$SNAP,
  method = "kendall",
  exact = FALSE
)
kendall_test
##
## Kendall's rank correlation tau
##
## data: df$Diagnosed and df$SNAP
## z = 17.979, p-value < 2.2e-16
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
## tau
## 0.6280234

# Take tau from the test result instead of computing Kendall's tau a second
# time with cor(); unname() drops the "tau" label so this stays a plain number.
cor_kendall <- unname(kendall_test$estimate)
The correlation is positive: higher SNAP values are linked to higher Diagnosed values. Had it been negative, higher SNAP would have been linked to lower Diagnosed. The Kendall p-value tells you whether that link is likely real or just noise. I used Kendall’s tau because it handles skewed data and outliers well, works well with smaller data sets, and was highly recommended by you in class :-) Aside from this correlation, the cost of healthier food also impacts diagnosed diabetes cases.