# Load the required libraries and read in the datasets
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
# Work from the class folder so the bare file names passed to the readers
# resolve against it.
setwd(dir = "C:/Users/Az/Downloads/My Class Stuff/Monday Class")
# Dog-bite incident records; readr guesses the column types (see the column
# specification and parsing warning printed below).
dog_bite <- read_csv(file = "Dog_Bite_Data_20260204.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 76472 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): Bite Number, Bite Type, Incident Date, Victim Relationship, Bite L...
## dbl (2): Victim Age, Treatment Cost
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Census workbook (a DECENNIALDHC2020 P1 export, going by the file name);
# only the sheet named "Data" is read.
census <- read_excel(
  path = "DECENNIALDHC2020.P1-2026-03-23T192017.xlsx",
  sheet = "Data"
)
# Reshape the census sheet from one column per ZIP code to one row per ZIP
# code so it can be joined onto the dog-bite records.
zip_long <- census %>%
  # keep only rows whose Label is exactly "Total"
  filter(Label == "Total") %>%
  # every column except the label and the city-wide column becomes a row
  pivot_longer(
    cols = -c(Label, `Dallas city, Texas`),
    names_to = "zipcode",
    values_to = "population"
  ) %>%
  mutate(
    # strip the leading "ZCTA5 " prefix carried over from the column names
    zipcode = str_replace(zipcode, "^ZCTA5\\s+", ""),
    # remove thousands separators, then convert to numeric
    population = as.numeric(gsub(",", "", population, fixed = TRUE))
  ) %>%
  select(zipcode, population)
# Keep only the incident fields used downstream and discard every record that
# has a missing value in any of them.
dog_bite_clean <- dog_bite %>%
  select(all_of(c(
    "Bite Number",
    "Bite Type",
    "Victim Relationship",
    "Bite Circumstance",
    "Incident Location",
    "Victim Age"
  ))) %>%
  drop_na()
# Attach the ZIP-level population to each incident.
# The ZIP code is taken as the last five characters of the location string.
dog_bite_clean <- dog_bite_clean %>%
  mutate(zip_code = str_sub(`Incident Location`, start = -5L)) %>%
  left_join(zip_long, by = join_by(zip_code == zipcode))
# Drop every record with a missing value; this includes incidents whose
# ZIP code found no match in zip_long and therefore have no population.
dog_bite_clean_2 <- drop_na(dog_bite_clean)
# Restrict the data to incidents whose bite type is exactly "BITE".
dog_bite_clean_2 <- filter(dog_bite_clean_2, `Bite Type` == "BITE")
# Binary ZIP-code indicator: program_zip is 1 for ZIP codes 75238, 75231 and
# 75228, and 0 for every other ZIP code.
# A single set-membership test replaces the nested ifelse() chain. %in% never
# returns NA, so the flag is always 0 or 1 (rows with a missing zip_code were
# already removed by the earlier drop_na()).
dog_bite_clean_2 <- dog_bite_clean_2 %>%
  mutate(program_zip = as.numeric(zip_code %in% c("75238", "75231", "75228")))
# Analysis data set: outcome sources, victim age, and the ZIP-level variables.
dog_bite_clean_3 <- select(
  dog_bite_clean_2,
  `Bite Type`, `Victim Relationship`, `Bite Circumstance`, `Victim Age`,
  zip_code, population, program_zip
)
# One data frame per outcome of interest, each adding a single 0/1 indicator
# (1 when the record matches the named category, 0 otherwise).
# Victim relationship: STRANGER
stranger_model <- mutate(
  dog_bite_clean_3,
  STRANGER = as.numeric(`Victim Relationship` == "STRANGER")
)
# Victim relationship: RELATIVES
relatives_model <- mutate(
  dog_bite_clean_3,
  RELATIVES = as.numeric(`Victim Relationship` == "RELATIVES")
)
# Bite circumstance: WALKING
walking_model <- mutate(
  dog_bite_clean_3,
  WALKING = as.numeric(`Bite Circumstance` == "WALKING")
)
# Bite circumstance: PETTING
petting_model <- mutate(
  dog_bite_clean_3,
  PETTING = as.numeric(`Bite Circumstance` == "PETTING")
)
# Single modelling data set carrying all four 0/1 outcome indicators.
is_circumstance_petting_etc <- NULL
final_dog_data <- mutate(
  dog_bite_clean_3,
  PETTING   = if_else(`Bite Circumstance` == "PETTING", 1, 0),
  WALKING   = if_else(`Bite Circumstance` == "WALKING", 1, 0),
  STRANGER  = if_else(`Victim Relationship` == "STRANGER", 1, 0),
  RELATIVES = if_else(`Victim Relationship` == "RELATIVES", 1, 0)
)
rm(is_circumstance_petting_etc)
# Descriptive statistics for the variables of interest
# 1. Bite circumstance: number of incidents and percentage share per category
final_dog_data %>%
  count(`Bite Circumstance`) %>%
  mutate(percentage = prop.table(n) * 100)
## # A tibble: 12 × 3
## `Bite Circumstance` n percentage
## <chr> <int> <dbl>
## 1 CUT GRASS 311 0.759
## 2 DLRY SRVC 640 1.56
## 3 EXCERCISE 1030 2.51
## 4 FEEDING 1408 3.43
## 5 FIGHTING 4376 10.7
## 6 HANDLING 9128 22.3
## 7 HUGGING 825 2.01
## 8 MAIL DELIV 926 2.26
## 9 PETTING 3754 9.16
## 10 RIDE BIKE 721 1.76
## 11 SICK/INJ 265 0.646
## 12 WALKING 17616 43.0
# 2. Victim relationship: number of incidents and percentage share per category
final_dog_data %>%
  count(`Victim Relationship`) %>%
  mutate(percentage = prop.table(n) * 100)
## # A tibble: 15 × 3
## `Victim Relationship` n percentage
## <chr> <int> <dbl>
## 1 DASEMPLOYE 243 0.593
## 2 DASVOLNTER 18 0.0439
## 3 DELIVERY 842 2.05
## 4 EMPLOYEE 494 1.20
## 5 FOSTER 139 0.339
## 6 FRIEND 738 1.8
## 7 NEIGHBOR 7038 17.2
## 8 OWNED 26 0.0634
## 9 OWNER/VIC 8907 21.7
## 10 RELATIVES 3348 8.17
## 11 STRANGER 17959 43.8
## 12 STRAY 9 0.0220
## 13 VET 1225 2.99
## 14 VICTIM 5 0.0122
## 15 VOLUNTEER 9 0.0220
# 3. Descriptive statistics for every column, including victim age
# summary() on a data frame always summarises all columns. Do not pass a
# column name as a second argument: it binds to `maxsum` instead of selecting
# a column, and errors as soon as a factor column forces it to be evaluated.
# NOTE(review): the output below reports a maximum Victim Age of 768, which is
# not a plausible age; consider checking or filtering it before modelling.
summary(final_dog_data)
## Bite Type Victim Relationship Bite Circumstance Victim Age
## Length:41000 Length:41000 Length:41000 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 15.00
## Mode :character Mode :character Mode :character Median : 29.00
## Mean : 32.25
## 3rd Qu.: 47.00
## Max. :768.00
## zip_code population program_zip PETTING
## Length:41000 Min. : 830 Min. :0.00000 Min. :0.00000
## Class :character 1st Qu.:25867 1st Qu.:0.00000 1st Qu.:0.00000
## Mode :character Median :33766 Median :0.00000 Median :0.00000
## Mean :41333 Mean :0.08615 Mean :0.09156
## 3rd Qu.:59103 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :85330 Max. :1.00000 Max. :1.00000
## WALKING STRANGER RELATIVES
## Min. :0.0000 Min. :0.000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.00000
## Median :0.0000 Median :0.000 Median :0.00000
## Mean :0.4297 Mean :0.438 Mean :0.08166
## 3rd Qu.:1.0000 3rd Qu.:1.000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.000 Max. :1.00000
# Logistic regressions: each 0/1 outcome indicator is modelled on ZIP-code
# population, the program_zip flag and victim age.
# Outcome: STRANGER
stranger_age_model <- glm(
  formula = STRANGER ~ population + program_zip + `Victim Age`,
  family = binomial(link = "logit"),
  data = final_dog_data
)
stranger_age_model %>% summary()
##
## Call:
## glm(formula = STRANGER ~ population + program_zip + `Victim Age`,
## family = binomial(link = "logit"), data = final_dog_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.482e-01 2.764e-02 -23.454 < 2e-16 ***
## population 2.290e-06 4.650e-07 4.925 8.42e-07 ***
## program_zip -2.164e-01 3.677e-02 -5.885 3.97e-09 ***
## `Victim Age` 9.966e-03 4.882e-04 20.413 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 56207 on 40999 degrees of freedom
## Residual deviance: 55734 on 40996 degrees of freedom
## AIC: 55742
##
## Number of Fisher Scoring iterations: 4
# Logistic regression, outcome: RELATIVES
# NOTE(review): the warning below may be driven by extreme Victim Age values
# (the summary output shows a maximum of 768) - confirm.
relatives_age_model <- glm(
  formula = RELATIVES ~ population + program_zip + `Victim Age`,
  family = binomial(link = "logit"),
  data = final_dog_data
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
relatives_age_model %>% summary()
##
## Call:
## glm(formula = RELATIVES ~ population + program_zip + `Victim Age`,
## family = binomial(link = "logit"), data = final_dog_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.580e+00 4.911e-02 -32.182 < 2e-16 ***
## population 6.656e-06 8.209e-07 8.109 5.11e-16 ***
## program_zip 1.646e-01 6.104e-02 2.696 0.00702 **
## `Victim Age` -4.554e-02 1.228e-03 -37.083 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 23190 on 40999 degrees of freedom
## Residual deviance: 21235 on 40996 degrees of freedom
## AIC: 21243
##
## Number of Fisher Scoring iterations: 6
# Logistic regression, outcome: WALKING
walking_age_model <- glm(
  formula = WALKING ~ population + program_zip + `Victim Age`,
  family = binomial(link = "logit"),
  data = final_dog_data
)
walking_age_model %>% summary()
##
## Call:
## glm(formula = WALKING ~ population + program_zip + `Victim Age`,
## family = binomial(link = "logit"), data = final_dog_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.234e-01 2.793e-02 -29.483 < 2e-16 ***
## population 4.582e-06 4.668e-07 9.816 < 2e-16 ***
## program_zip -2.948e-01 3.714e-02 -7.935 2.1e-15 ***
## `Victim Age` 1.157e-02 4.921e-04 23.512 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 56024 on 40999 degrees of freedom
## Residual deviance: 55346 on 40996 degrees of freedom
## AIC: 55354
##
## Number of Fisher Scoring iterations: 4
# Logistic regression, outcome: PETTING
petting_age_model <- glm(
  formula = PETTING ~ population + program_zip + `Victim Age`,
  family = binomial(link = "logit"),
  data = final_dog_data
)
petting_age_model %>% summary()
##
## Call:
## glm(formula = PETTING ~ population + program_zip + `Victim Age`,
## family = binomial(link = "logit"), data = final_dog_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.250e+00 4.567e-02 -27.372 < 2e-16 ***
## population -3.191e-06 8.108e-07 -3.935 8.30e-05 ***
## program_zip 2.809e-01 5.831e-02 4.818 1.45e-06 ***
## `Victim Age` -3.515e-02 1.059e-03 -33.196 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 25103 on 40999 degrees of freedom
## Residual deviance: 23752 on 40996 degrees of freedom
## AIC: 23760
##
## Number of Fisher Scoring iterations: 6