Homework 4

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(readxl)
library(pastecs)

## 
## Attaching package: 'pastecs'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## The following object is masked from 'package:tidyr':
## 
##     extract

library(readxl)
library(dplyr)
# Load the diabetes workbook into `Diabetes_Data`.
# The folder is built once with file.path() and the file is read a single time.
# No setwd(): the session's working directory is left untouched, so the script
# does not alter global state for whatever code runs after it.
data_dir <- file.path(
  "C:/Users/miche/OneDrive/Desktop",
  "My Class Stuff", "Wednesday Class", "Data Diabetes"
)
Diabetes_Data <- read_excel(file.path(data_dir, "Diabetes Data.xlsx"))

# Keep only the two analysis columns (Diagnosed, SNAP), then drop every row
# that has a missing value in either of them.
Cleaned_Diabetes_Data <- drop_na(
  select(Diabetes_Data, Diagnosed, SNAP)
)

# Descriptive statistics for Diagnosed; norm = TRUE adds the normality block
# (skewness, kurtosis, normtest.W / normtest.p) to the output.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
pastecs::stat.desc(Cleaned_Diabetes_Data$Diagnosed, norm = TRUE)

##       nbr.val      nbr.null        nbr.na           min           max 
##  3.710000e+02  0.000000e+00  0.000000e+00  2.500000e+00  2.950000e+01 
##         range           sum        median          mean       SE.mean 
##  2.700000e+01  6.106900e+03  1.530000e+01  1.646065e+01  2.512205e-01 
##  CI.mean.0.95           var       std.dev      coef.var      skewness 
##  4.939989e-01  2.341445e+01  4.838848e+00  2.939646e-01  2.808935e-01 
##      skew.2SE      kurtosis      kurt.2SE    normtest.W    normtest.p 
##  1.108840e+00 -5.697677e-01 -1.127562e+00  9.700359e-01  6.371567e-07

# Five-number summary plus the mean for the Diagnosed column.
summary(Cleaned_Diabetes_Data[["Diagnosed"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.50   12.70   15.30   16.46   20.15   29.50

Observation: The variable measures how many SNAP recipients have been diagnosed with diabetes. The summary shows that the mean prevalence is about 16.5%, with values ranging from 2.5% to nearly 30%. Skewness is slightly positive (about 0.28), meaning that more census tracts cluster at the lower-to-middle end of the distribution, with fewer tracts at the extreme high-prevalence end.

# Histogram of SNAP on its original scale.
hist(Cleaned_Diabetes_Data$SNAP)

# Add SNAP_log, the natural log of SNAP, as a new column.
# NOTE(review): log() returns -Inf for 0 and NaN for negative values; confirm
# that SNAP is strictly positive in this data set.
UpdatedData <- mutate(Cleaned_Diabetes_Data, SNAP_log = log(SNAP))

# Histogram of the log-transformed SNAP values, for comparison with the one above.
hist(UpdatedData$SNAP_log)

Homework 4

Michelle Vigil

2025-10-08