library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## âś” dplyr     1.2.0     âś” readr     2.1.6
## âś” forcats   1.0.1     âś” stringr   1.6.0
## âś” ggplot2   4.0.2     âś” tibble    3.3.1
## âś” lubridate 1.9.5     âś” tidyr     1.3.2
## âś” purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## âś– dplyr::filter() masks stats::filter()
## âś– dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)



setwd("C:/Users/Az/Downloads/My Class Stuff/Monday Class")

dog_bite<-read_csv("Dog_Bite_Data_20260204.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 76472 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): Bite Number, Bite Type, Incident Date, Victim Relationship, Bite L...
## dbl  (2): Victim Age, Treatment Cost
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
census<-read_excel("DECENNIALDHC2020.P1-2026-03-23T192017.xlsx", 
    sheet = "Data")
#Making population data the right format

zip_long <- census %>%
  filter(Label == "Total") %>%                      
  pivot_longer(
    cols = -c(Label, `Dallas city, Texas`), 
    names_to = "zipcode",
    values_to = "population") %>%
  mutate(zipcode = str_remove(zipcode, "^ZCTA5\\s+"),  
    population = as.numeric(gsub(",", "", population))) %>%
  dplyr::select(zipcode, population)
#cleaning data 

dog_bite_clean<-dog_bite %>% dplyr::select(`Bite Number`,`Bite Type`,`Victim Relationship`,`Bite Circumstance`,`Incident Location`, `Victim Age`) %>% drop_na()

dog_bite_clean <- dog_bite_clean %>% mutate(zip_code=str_sub(`Incident Location`,-5))

dog_bite_clean<-left_join(dog_bite_clean,zip_long,by=join_by(zip_code==zipcode))

dog_bite_clean_2<-dog_bite_clean %>% drop_na()
#Cleaning up th bite type to include only bites
dog_bite_clean_2 <- dog_bite_clean_2 %>% filter(`Bite Type`== "BITE")

#Making binary measure for the zip codes
dog_bite_clean_2 <- dog_bite_clean_2 %>%  mutate(program_zip=ifelse(zip_code=="75238",1,
                               ifelse(zip_code=="75231",1,
                               ifelse(zip_code=="75228",1,0))))
#selecting the variables that I need only
dog_bite_clean_3 <- dog_bite_clean_2 %>% dplyr::select(`Bite Type`,`Victim Relationship`,`Bite Circumstance`, `Victim Age`, zip_code, population, program_zip)
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
#2) Create a linear model "lm()" from the variables, with a continuous dependent variable as the outcome


age_linear_model <- lm(`Victim Age` ~ `Victim Relationship` + `Bite Circumstance`, data= dog_bite_clean_3)

summary(age_linear_model)
## 
## Call:
## lm(formula = `Victim Age` ~ `Victim Relationship` + `Bite Circumstance`, 
##     data = dog_bite_clean_3)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -41.93 -15.03  -3.67  12.97 731.97 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      47.4720     1.8794  25.260  < 2e-16 ***
## `Victim Relationship`DASVOLNTER   1.2448     5.3017   0.235   0.8144    
## `Victim Relationship`DELIVERY     2.8251     1.7470   1.617   0.1059    
## `Victim Relationship`EMPLOYEE     6.8210     1.7035   4.004 6.23e-05 ***
## `Victim Relationship`FOSTER      14.8680     2.3085   6.441 1.20e-10 ***
## `Victim Relationship`FRIEND      -1.1034     1.6139  -0.684   0.4942    
## `Victim Relationship`NEIGHBOR     2.4532     1.4358   1.709   0.0875 .  
## `Victim Relationship`OWNED        9.9500     4.4895   2.216   0.0267 *  
## `Victim Relationship`OWNER/VIC    0.3633     1.4152   0.257   0.7974    
## `Victim Relationship`RELATIVES  -10.0020     1.4502  -6.897 5.40e-12 ***
## `Victim Relationship`STRANGER     3.2600     1.4193   2.297   0.0216 *  
## `Victim Relationship`STRAY        7.9824     7.3750   1.082   0.2791    
## `Victim Relationship`VET         -0.5862     1.5250  -0.384   0.7007    
## `Victim Relationship`VICTIM      -6.7728     9.8079  -0.691   0.4899    
## `Victim Relationship`VOLUNTEER    2.1680     7.3678   0.294   0.7686    
## `Bite Circumstance`DLRY SRVC    -14.6964     1.6242  -9.049  < 2e-16 ***
## `Bite Circumstance`EXCERCISE    -22.8909     1.4096 -16.239  < 2e-16 ***
## `Bite Circumstance`FEEDING      -15.0683     1.3793 -10.924  < 2e-16 ***
## `Bite Circumstance`FIGHTING     -11.9360     1.2846  -9.292  < 2e-16 ***
## `Bite Circumstance`HANDLING     -17.2592     1.2724 -13.564  < 2e-16 ***
## `Bite Circumstance`HUGGING      -29.0481     1.4659 -19.815  < 2e-16 ***
## `Bite Circumstance`MAIL DELIV   -15.4388     1.4917 -10.350  < 2e-16 ***
## `Bite Circumstance`PETTING      -25.7989     1.2944 -19.931  < 2e-16 ***
## `Bite Circumstance`RIDE BIKE    -21.7164     1.4755 -14.718  < 2e-16 ***
## `Bite Circumstance`SICK/INJ      -9.5990     1.8339  -5.234 1.66e-07 ***
## `Bite Circumstance`WALKING      -14.6992     1.2445 -11.811  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21.7 on 40974 degrees of freedom
## Multiple R-squared:  0.07662,    Adjusted R-squared:  0.07606 
## F-statistic:   136 on 25 and 40974 DF,  p-value: < 2.2e-16
#3) Check the following assumptions:
 # a) Linearity (plot and raintest)

plot(age_linear_model,which=1)

#Yes, most of the plots on this chart fall along the line with the exeption of a few outliers. 

raintest(age_linear_model)
## 
##  Rainbow test
## 
## data:  age_linear_model
## Rain = 0.74999, df1 = 20500, df2 = 20474, p-value = 1
#The P value from this Rain Test is 1. This is a very high P value, meaning that the data is linear. 

# b) Independence of errors (durbin-watson)

library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
durbinWatsonTest(age_linear_model)
##  lag Autocorrelation D-W Statistic p-value
##    1       0.3593692      1.281243       0
##  Alternative hypothesis: rho != 0
# The p-value is 0. Since the p- value is less than 0.5, we can say that the errors are NOT independent. 

#c) Homoscedasticity (plot, bptest)

plot(age_linear_model,which=3)

bptest(age_linear_model)
## 
##  studentized Breusch-Pagan test
## 
## data:  age_linear_model
## BP = 46.638, df = 25, p-value = 0.005408
#Visually, the red line is mostly straight and the plots are generally around the red line. The bptest confirms this as the p-value is less than 0.05, provinf that the data is homoscedastic. 


#d) Normality of residuals (QQ plot, shapiro test)

plot(age_linear_model,which=2)

#shapiro.test(age_linear_model$residuals)

#The plot test visually confirms that the normality of residuals since the residuals follow the closely to the dotted lines, except for a few outliers. Unfotunately, I was not able to run the shapiro test with my data. 

 # e) No multicolinarity (VIF, cor)

vif(age_linear_model)
##                           GVIF Df GVIF^(1/(2*Df))
## `Victim Relationship` 3.161614 14        1.041967
## `Bite Circumstance`   3.161614 11        1.053715
#Because no VIF is over 10, the variables in this model do not seem to be strongly correlated with each other. 
  1. does your model meet those assumptions? You don’t have to be perfectly right, just make a good case.

From all the previous tests ran, my model does not meet all the assumptions. The one assumption that it violates is the idependence of errors.

  1. If your model violates an assumption, which one?

The model violates the independence of errors test since the p-value is 0.Models are indepdent only when the p-value is greater than 0.05.

  1. What would you do to mitigate this assumption? Show your work.

To mitigate this error, I would run a robust linear model instead:

age_linear_robust <- rlm(Victim Age ~ Victim Relationship + Bite Circumstance,data = dog_bite_clean_3)

summary(age_linear_robust)