# Load the required libraries and read in the datasets

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)

# NOTE(review): hard-coded, machine-specific working directory. The relative
# file names passed to the readers are resolved against it, so the script only
# runs as-is on a machine that has this folder.
setwd("C:/Users/Az/Downloads/My Class Stuff/Monday Class")

# Dog bite records (CSV export); column types are guessed by readr.
# NOTE(review): readr reports parsing issues for this file. Inspect them with
# problems(dog_bite) before relying on the guessed column types.
dog_bite <- read_csv(file = "Dog_Bite_Data_20260204.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 76472 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): Bite Number, Bite Type, Incident Date, Victim Relationship, Bite L...
## dbl  (2): Victim Age, Treatment Cost
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Census workbook; only the "Data" sheet is read.
census <- read_excel(
  path = "DECENNIALDHC2020.P1-2026-03-23T192017.xlsx",
  sheet = "Data"
)
# Reshape the census "Total" row(s) to long form, one row per ZIP column, so
# the populations can be joined to the dog bite data.

zip_long <- census %>%
  filter(Label == "Total") %>%
  # Every column except the label and the city-wide column becomes a row.
  pivot_longer(
    cols = !c(Label, `Dallas city, Texas`),
    names_to = "zipcode",
    values_to = "population"
  ) %>%
  select(zipcode, population) %>%
  mutate(
    # Drop the leading "ZCTA5 " prefix from the former column header.
    zipcode = str_remove(zipcode, "^ZCTA5\\s+"),
    # Strip thousands separators before converting to numeric.
    population = as.numeric(gsub(",", "", population, fixed = TRUE))
  )
# Keep only the columns needed for the analysis.

dog_bite_clean <- dog_bite %>%
  select(
    `Bite Number`,
    `Bite Type`,
    `Victim Relationship`,
    `Bite Circumstance`,
    `Incident Location`,
    `Victim Age`
  ) %>%
  # Discard any record with a missing value in one of the selected columns.
  drop_na()
# Merge the bite records with the ZIP-level census population.

# The ZIP code is taken as the last five characters of the incident location.
dog_bite_clean <- dog_bite_clean %>%
  mutate(zip_code = str_sub(`Incident Location`, -5))

# Many bite rows map to at most one census row per ZIP. Declaring that
# relationship makes the join fail loudly if `zip_long` ever holds a ZIP more
# than once, instead of silently duplicating bite records.
dog_bite_clean <- left_join(
  dog_bite_clean,
  zip_long,
  by = join_by(zip_code == zipcode),
  relationship = "many-to-one"
)

# Drop rows left with a missing value after the join, i.e. bites whose
# extracted ZIP has no census population.
dog_bite_clean_2 <- dog_bite_clean %>%
  drop_na()
# Restrict the data to records whose bite type is exactly "BITE".
dog_bite_clean_2 <- dog_bite_clean_2 %>%
  filter(`Bite Type` == "BITE")

# Binary measure for the ZIP codes: 1 for the three listed ZIP codes, 0 for
# every other ZIP. `%in%` expresses the set membership directly instead of a
# nested ifelse() chain; zip_code has no missing values at this point because
# incomplete rows were dropped after the join.
dog_bite_clean_2 <- dog_bite_clean_2 %>%
  mutate(
    program_zip = ifelse(zip_code %in% c("75238", "75231", "75228"), 1, 0)
  )

# Keep only the variables needed for the descriptive tables and the models.
dog_bite_clean_3 <- dog_bite_clean_2 %>%
  select(
    `Bite Type`,
    `Victim Relationship`,
    `Bite Circumstance`,
    `Victim Age`,
    zip_code,
    population,
    program_zip
  )
# Single-indicator data sets: each one copies dog_bite_clean_3 and adds one
# 0/1 column for a category of interest.

# Victim relationship: STRANGER
stranger_model <- mutate(
  dog_bite_clean_3,
  STRANGER = ifelse(`Victim Relationship` == "STRANGER", 1, 0)
)

# Victim relationship: RELATIVES
relatives_model <- mutate(
  dog_bite_clean_3,
  RELATIVES = ifelse(`Victim Relationship` == "RELATIVES", 1, 0)
)

# Bite circumstance: WALKING
walking_model <- mutate(
  dog_bite_clean_3,
  WALKING = ifelse(`Bite Circumstance` == "WALKING", 1, 0)
)

# Bite circumstance: PETTING
petting_model <- mutate(
  dog_bite_clean_3,
  PETTING = ifelse(`Bite Circumstance` == "PETTING", 1, 0)
)
# Combine all four 0/1 indicators into a single data set.
final_dog_data <- mutate(
  dog_bite_clean_3,
  # Bite circumstance indicators
  PETTING = ifelse(`Bite Circumstance` == "PETTING", 1, 0),
  WALKING = ifelse(`Bite Circumstance` == "WALKING", 1, 0),
  # Victim relationship indicators
  STRANGER = ifelse(`Victim Relationship` == "STRANGER", 1, 0),
  RELATIVES = ifelse(`Victim Relationship` == "RELATIVES", 1, 0)
)
# Overall descriptive statistics for the variables of interest
# 1. Bite circumstance: records per category and their share of all records (%)

count(final_dog_data, `Bite Circumstance`) %>%
  mutate(percentage = n / sum(n) * 100)
## # A tibble: 12 × 3
##    `Bite Circumstance`     n percentage
##    <chr>               <int>      <dbl>
##  1 CUT GRASS             311      0.759
##  2 DLRY SRVC             640      1.56 
##  3 EXCERCISE            1030      2.51 
##  4 FEEDING              1408      3.43 
##  5 FIGHTING             4376     10.7  
##  6 HANDLING             9128     22.3  
##  7 HUGGING               825      2.01 
##  8 MAIL DELIV            926      2.26 
##  9 PETTING              3754      9.16 
## 10 RIDE BIKE             721      1.76 
## 11 SICK/INJ              265      0.646
## 12 WALKING             17616     43.0
# 2. Victim relationship: records per category and their share of all records (%)
count(final_dog_data, `Victim Relationship`) %>%
  mutate(percentage = n / sum(n) * 100)
## # A tibble: 15 × 3
##    `Victim Relationship`     n percentage
##    <chr>                 <int>      <dbl>
##  1 DASEMPLOYE              243     0.593 
##  2 DASVOLNTER               18     0.0439
##  3 DELIVERY                842     2.05  
##  4 EMPLOYEE                494     1.20  
##  5 FOSTER                  139     0.339 
##  6 FRIEND                  738     1.8   
##  7 NEIGHBOR               7038    17.2   
##  8 OWNED                    26     0.0634
##  9 OWNER/VIC              8907    21.7   
## 10 RELATIVES              3348     8.17  
## 11 STRANGER              17959    43.8   
## 12 STRAY                     9     0.0220
## 13 VET                    1225     2.99  
## 14 VICTIM                    5     0.0122
## 15 VOLUNTEER                 9     0.0220
# 3. Victim age descriptive statistics
# Summarise the age column itself. summary() on a whole data frame does not
# treat a column name passed as its second argument as a selector, so piping
# the data frame into summary(`Victim Age`) summarised every column instead.
# NOTE(review): the maximum of 768 is not a plausible age. Check for data-entry
# errors, since `Victim Age` is also used as a predictor in the regressions.
summary(final_dog_data$`Victim Age`)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   15.00   29.00   32.25   47.00  768.00 
# Logistic regressions: each 0/1 indicator is regressed on population, the
# program_zip flag and victim age.
# Stranger model
stranger_age_model <- glm(
  formula = STRANGER ~ population + program_zip + `Victim Age`,
  family = binomial(link = "logit"),
  data = final_dog_data
)

summary(stranger_age_model)
## 
## Call:
## glm(formula = STRANGER ~ population + program_zip + `Victim Age`, 
##     family = binomial(link = "logit"), data = final_dog_data)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -6.482e-01  2.764e-02 -23.454  < 2e-16 ***
## population    2.290e-06  4.650e-07   4.925 8.42e-07 ***
## program_zip  -2.164e-01  3.677e-02  -5.885 3.97e-09 ***
## `Victim Age`  9.966e-03  4.882e-04  20.413  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 56207  on 40999  degrees of freedom
## Residual deviance: 55734  on 40996  degrees of freedom
## AIC: 55742
## 
## Number of Fisher Scoring iterations: 4
# Relatives model
# NOTE(review): this fit warns that some fitted probabilities are numerically
# 0 or 1; possibly driven by extreme `Victim Age` values. Confirm.
relatives_age_model <- glm(
  formula = RELATIVES ~ population + program_zip + `Victim Age`,
  family = binomial(link = "logit"),
  data = final_dog_data
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(relatives_age_model)
## 
## Call:
## glm(formula = RELATIVES ~ population + program_zip + `Victim Age`, 
##     family = binomial(link = "logit"), data = final_dog_data)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.580e+00  4.911e-02 -32.182  < 2e-16 ***
## population    6.656e-06  8.209e-07   8.109 5.11e-16 ***
## program_zip   1.646e-01  6.104e-02   2.696  0.00702 ** 
## `Victim Age` -4.554e-02  1.228e-03 -37.083  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 23190  on 40999  degrees of freedom
## Residual deviance: 21235  on 40996  degrees of freedom
## AIC: 21243
## 
## Number of Fisher Scoring iterations: 6
# Walking model: logistic regression of the WALKING indicator on population,
# the program_zip flag and victim age.
walking_age_model <- glm(
  formula = WALKING ~ population + program_zip + `Victim Age`,
  family = binomial(link = "logit"),
  data = final_dog_data
)

summary(walking_age_model)
## 
## Call:
## glm(formula = WALKING ~ population + program_zip + `Victim Age`, 
##     family = binomial(link = "logit"), data = final_dog_data)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -8.234e-01  2.793e-02 -29.483  < 2e-16 ***
## population    4.582e-06  4.668e-07   9.816  < 2e-16 ***
## program_zip  -2.948e-01  3.714e-02  -7.935  2.1e-15 ***
## `Victim Age`  1.157e-02  4.921e-04  23.512  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 56024  on 40999  degrees of freedom
## Residual deviance: 55346  on 40996  degrees of freedom
## AIC: 55354
## 
## Number of Fisher Scoring iterations: 4
# Petting model: logistic regression of the PETTING indicator on population,
# the program_zip flag and victim age.
petting_age_model <- glm(
  formula = PETTING ~ population + program_zip + `Victim Age`,
  family = binomial(link = "logit"),
  data = final_dog_data
)

summary(petting_age_model)
## 
## Call:
## glm(formula = PETTING ~ population + program_zip + `Victim Age`, 
##     family = binomial(link = "logit"), data = final_dog_data)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.250e+00  4.567e-02 -27.372  < 2e-16 ***
## population   -3.191e-06  8.108e-07  -3.935 8.30e-05 ***
## program_zip   2.809e-01  5.831e-02   4.818 1.45e-06 ***
## `Victim Age` -3.515e-02  1.059e-03 -33.196  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 25103  on 40999  degrees of freedom
## Residual deviance: 23752  on 40996  degrees of freedom
## AIC: 23760
## 
## Number of Fisher Scoring iterations: 6