---
title: "hw 5"
author: "janice palma"
date: "2025-02-26"
output: html_document
---

# Print a six-number summary of each column of the `cars` dataset.
# NOTE(review): this looks like a leftover from the default R Markdown
# template and is unrelated to the overdose analysis below -- confirm.
summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

HOMEWORK due 03/16

can use your data or old data

  1. From the data you have chosen (overdose data BEXAR COUNTY), select a variable that you are interested in : Ethnicity

  2. Use `pastecs::stat.desc` to describe the variable. Include a few sentences about what the variable is and what it is measuring. Remember to load pastecs with `library(pastecs)`.

I will be analyzing the distribution of overdose counts across ethnic groups to identify potential disparities and patterns in how overdoses affect different communities in Bexar County.

  1. Remove NAs if needed using `dplyr::filter` (or anything similar)

  2. Provide a histogram of the variable (as shown in this lesson)

  3. Transform the variable using the log transformation or the square root transformation (whichever is more appropriate) using `dplyr::mutate` or something similar

  4. provide a histogram of the transformed variable

  5. submit via rpubs on CANVAS

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(readxl)
library(pastecs)
## 
## Attaching package: 'pastecs'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## The following object is masked from 'package:tidyr':
## 
##     extract
# Import the overdose table from its Excel export, then preview the first
# six rows to check the column names and types.
oddata <- read_excel(path = "table builder_data.xlsx")
head(oddata, n = 6L)
## # A tibble: 6 × 7
##   Demographic `Geographic Area` `Drug Type`                  `Data Source`  Year
##   <chr>       <chr>             <chr>                        <chr>         <dbl>
## 1 Hispanic    Bexar             Synthetic Opioids            Source: 2011…  2021
## 2 Hispanic    Bexar             Psychostimulants             Source: 2011…  2021
## 3 Hispanic    Bexar             Other and Unspecified Narco… Source: 2011…  2021
## 4 Hispanic    Bexar             Opium                        Source: 2011…  2021
## 5 Hispanic    Bexar             Natural and Semisynthetic O… Source: 2011…  2021
## 6 Hispanic    Bexar             Methadone                    Source: 2011…  2021
## # ℹ 2 more variables: `Total with Zero` <chr>,
## #   `Deaths per 100,000 population` <chr>
# Collapse the raw table to one row per unique combination of demographic
# group, drug type, death count and death rate; `Count` records how many raw
# rows share that combination.
demograhic_data <- oddata %>%
  count(
    Demographic,
    `Drug Type`,
    `Total with Zero`,
    `Deaths per 100,000 population`,
    name = "Count"
  )

# "\n" (backslash) is the newline escape; the previous "/n" was printed
# literally at the end of the heading.
cat("descriptive statistics for overdose counts by demographics, drug type, total with zero, deaths per 100,000 population:\n")

# `Total with Zero` is read from Excel as text, and stat.desc() returns NA for
# every statistic when given a character vector. Convert to numeric first.
# Entries that are not numbers become NA (as.numeric() warns about this) and
# are reported by stat.desc() under `nbr.na`.
total_with_zero <- as.numeric(demograhic_data$`Total with Zero`)
stat.desc(total_with_zero, norm = TRUE)
# (Re-knit the document to refresh the printed statistics for this chunk.)
# Keep only the two measures of interest and clean them.
# Both columns arrive as text, so they are converted to numeric *before*
# missing values are dropped: non-numeric entries only turn into NA during the
# conversion, and calling na.omit() first (as this chunk did previously) left
# those NAs in the data. as.numeric() warns when it introduces NAs.
better_demograhic_data <- demograhic_data %>%
  select(`Total with Zero`, `Deaths per 100,000 population`) %>%
  mutate(across(everything(), as.numeric)) %>%
  na.omit()

# Histogram of the overdose death counts.
hist(better_demograhic_data$`Total with Zero`)

# Histogram of the death rate per 100,000 population.
hist(better_demograhic_data$`Deaths per 100,000 population`)

# Shapiro-Wilk normality test on the untransformed `Total with Zero` column.
shapiro.test(better_demograhic_data$`Total with Zero`)
## 
##  Shapiro-Wilk normality test
## 
## data:  better_demograhic_data$`Total with Zero`
## W = 0.86789, p-value = 0.03919
# Histograms on the density scale (bar areas sum to 1) instead of raw counts.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas TRUE is a reserved word.
hist(better_demograhic_data$`Total with Zero`, probability = TRUE)

hist(
  better_demograhic_data$`Deaths per 100,000 population`,
  probability = TRUE
)

# Log-transform the overdose counts, as required by the assignment.
# The second positional argument of log() is the *base*, so the earlier call
# log(`Total with Zero`, `Deaths per 100,000 population`) computed the log of
# the count in base "death rate" -- not a log transform of either variable --
# and produced NaN for rows where both values are 0.
# log1p(x) = log(1 + x) is used because the counts include zeros, for which
# a plain log() would give -Inf.
better_demograhic_data <- better_demograhic_data %>%
  mutate(better_demograhic_data_LOG_TRANSFORM = log1p(`Total with Zero`))
head(better_demograhic_data)

# Shapiro-Wilk normality test on the transformed counts.
shapiro.test(better_demograhic_data$better_demograhic_data_LOG_TRANSFORM)

# Density-scale histogram of the transformed counts (TRUE rather than `T`,
# which can be reassigned).
hist(
  better_demograhic_data$better_demograhic_data_LOG_TRANSFORM,
  probability = TRUE
)
# (Re-knit the document to refresh the printed output for this chunk.)