library(readr)
## Warning: package 'readr' was built under R version 4.3.3
HAB <- read_csv("C:/Users/scott6522/Downloads/Health_AnimalBites (1).csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 9003 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (8): SpeciesIDDesc, BreedIDDesc, GenderIDDesc, color, AdvIssuedYNDesc, ...
## dbl  (2): vaccination_yrs, victim_zip
## dttm (5): bite_date, vaccination_date, quarantine_date, head_sent_date, rele...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
View(HAB)
str(HAB)
## spc_tbl_ [9,003 × 15] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ bite_date        : POSIXct[1:9003], format: "1985-05-05" "1986-02-12" ...
##  $ SpeciesIDDesc    : chr [1:9003] "DOG" "DOG" "DOG" "DOG" ...
##  $ BreedIDDesc      : chr [1:9003] NA NA NA NA ...
##  $ GenderIDDesc     : chr [1:9003] "FEMALE" "UNKNOWN" "UNKNOWN" "MALE" ...
##  $ color            : chr [1:9003] "LIG. BROWN" "BRO & BLA" NA "BLA & BRO" ...
##  $ vaccination_yrs  : num [1:9003] 1 NA NA NA NA NA 1 NA NA NA ...
##  $ vaccination_date : POSIXct[1:9003], format: "1985-06-20" NA ...
##  $ victim_zip       : num [1:9003] 40229 40218 40219 NA NA ...
##  $ AdvIssuedYNDesc  : chr [1:9003] "NO" "NO" "NO" "NO" ...
##  $ WhereBittenIDDesc: chr [1:9003] "BODY" "BODY" "BODY" "BODY" ...
##  $ quarantine_date  : POSIXct[1:9003], format: "1985-05-05" "1986-02-12" ...
##  $ DispositionIDDesc: chr [1:9003] "UNKNOWN" "UNKNOWN" "UNKNOWN" "UNKNOWN" ...
##  $ head_sent_date   : POSIXct[1:9003], format: NA NA ...
##  $ release_date     : POSIXct[1:9003], format: NA NA ...
##  $ ResultsIDDesc    : chr [1:9003] "UNKNOWN" "UNKNOWN" "UNKNOWN" "UNKNOWN" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   bite_date = col_datetime(format = ""),
##   ..   SpeciesIDDesc = col_character(),
##   ..   BreedIDDesc = col_character(),
##   ..   GenderIDDesc = col_character(),
##   ..   color = col_character(),
##   ..   vaccination_yrs = col_double(),
##   ..   vaccination_date = col_datetime(format = ""),
##   ..   victim_zip = col_double(),
##   ..   AdvIssuedYNDesc = col_character(),
##   ..   WhereBittenIDDesc = col_character(),
##   ..   quarantine_date = col_datetime(format = ""),
##   ..   DispositionIDDesc = col_character(),
##   ..   head_sent_date = col_datetime(format = ""),
##   ..   release_date = col_datetime(format = ""),
##   ..   ResultsIDDesc = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
summary(HAB)
##    bite_date                      SpeciesIDDesc      BreedIDDesc       
##  Min.   :1952-05-28 00:00:00.00   Length:9003        Length:9003       
##  1st Qu.:2011-11-01 00:00:00.00   Class :character   Class :character  
##  Median :2013-09-07 00:00:00.00   Mode  :character   Mode  :character  
##  Mean   :2014-02-01 06:55:17.33                                        
##  3rd Qu.:2015-08-16 18:00:00.00                                        
##  Max.   :5013-07-15 00:00:00.00                                        
##  NA's   :317                                                           
##  GenderIDDesc          color           vaccination_yrs 
##  Length:9003        Length:9003        Min.   : 1.000  
##  Class :character   Class :character   1st Qu.: 1.000  
##  Mode  :character   Mode  :character   Median : 1.000  
##                                        Mean   : 1.452  
##                                        3rd Qu.: 1.000  
##                                        Max.   :11.000  
##                                        NA's   :5265    
##  vaccination_date                   victim_zip    AdvIssuedYNDesc   
##  Min.   :1985-06-20 00:00:00.00   Min.   : 6107   Length:9003       
##  1st Qu.:2011-07-05 12:00:00.00   1st Qu.:40211   Class :character  
##  Median :2013-06-05 00:00:00.00   Median :40218   Mode  :character  
##  Mean   :2013-05-17 03:39:03.72   Mean   :40437                     
##  3rd Qu.:2015-04-02 00:00:00.00   3rd Qu.:40243                     
##  Max.   :2018-07-21 00:00:00.00   Max.   :99674                     
##  NA's   :4888                     NA's   :1841                      
##  WhereBittenIDDesc  quarantine_date                  DispositionIDDesc 
##  Length:9003        Min.   :1985-05-05 00:00:00.00   Length:9003       
##  Class :character   1st Qu.:2010-06-09 00:00:00.00   Class :character  
##  Mode  :character   Median :2010-11-13 12:00:00.00   Mode  :character  
##                     Mean   :2010-09-08 11:55:00.58                     
##                     3rd Qu.:2011-06-16 00:00:00.00                     
##                     Max.   :2017-09-01 00:00:00.00                     
##                     NA's   :6983                                       
##  head_sent_date                    release_date                   
##  Min.   :2010-01-10 00:00:00.00   Min.   :2011-05-09 00:00:00.00  
##  1st Qu.:2012-08-25 12:00:00.00   1st Qu.:2015-11-03 00:00:00.00  
##  Median :2015-09-08 00:00:00.00   Median :2016-06-13 00:00:00.00  
##  Mean   :2014-11-25 22:47:05.31   Mean   :2016-04-17 12:31:23.46  
##  3rd Qu.:2016-09-12 12:00:00.00   3rd Qu.:2017-02-22 00:00:00.00  
##  Max.   :2017-09-05 00:00:00.00   Max.   :2019-05-19 00:00:00.00  
##  NA's   :8608                     NA's   :7558                    
##  ResultsIDDesc     
##  Length:9003       
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
HAB<- select(HAB, -BreedIDDesc)
HAB<- select(HAB, -color)
HAB<- select(HAB, -head_sent_date)
HAB<- select(HAB, -release_date)
HAB<- select(HAB, -quarantine_date)
HAB<- select(HAB, -vaccination_date)
HAB<- select(HAB, -bite_date)
HAB<- select(HAB, -WhereBittenIDDesc)
HAB<- select(HAB, -DispositionIDDesc)
HAB<- select(HAB, -AdvIssuedYNDesc)
HAB<- select(HAB, -GenderIDDesc)

HAB$vaccination_yrs[is.na(HAB$vaccination_yrs)] <- median(HAB$vaccination_yrs, na.rm = TRUE)
View(HAB)
str(HAB)
## tibble [9,003 × 4] (S3: tbl_df/tbl/data.frame)
##  $ SpeciesIDDesc  : chr [1:9003] "DOG" "DOG" "DOG" "DOG" ...
##  $ vaccination_yrs: num [1:9003] 1 1 1 1 1 1 1 1 1 1 ...
##  $ victim_zip     : num [1:9003] 40229 40218 40219 NA NA ...
##  $ ResultsIDDesc  : chr [1:9003] "UNKNOWN" "UNKNOWN" "UNKNOWN" "UNKNOWN" ...
HAB1 <- HAB[!is.na(HAB$ResultsIDDesc) & HAB$ResultsIDDesc != "", ]
str(HAB1)
## tibble [1,543 × 4] (S3: tbl_df/tbl/data.frame)
##  $ SpeciesIDDesc  : chr [1:1543] "DOG" "DOG" "DOG" "DOG" ...
##  $ vaccination_yrs: num [1:1543] 1 1 1 1 1 1 1 1 1 1 ...
##  $ victim_zip     : num [1:1543] 40229 40218 40219 NA NA ...
##  $ ResultsIDDesc  : chr [1:1543] "UNKNOWN" "UNKNOWN" "UNKNOWN" "UNKNOWN" ...
HAB1 <- HAB1[HAB1$ResultsIDDesc != "UNKNOWN", ]
str(HAB1)
## tibble [303 × 4] (S3: tbl_df/tbl/data.frame)
##  $ SpeciesIDDesc  : chr [1:303] "DOG" "DOG" "DOG" "CAT" ...
##  $ vaccination_yrs: num [1:303] 1 1 1 1 1 1 1 1 1 1 ...
##  $ victim_zip     : num [1:303] 40214 40291 40216 40215 40212 ...
##  $ ResultsIDDesc  : chr [1:303] "NEGATIVE" "NEGATIVE" "NEGATIVE" "NEGATIVE" ...
HAB1 <- HAB1[!is.na(HAB1$SpeciesIDDesc) & HAB1$SpeciesIDDesc != "", ]


str(HAB1)
## tibble [300 × 4] (S3: tbl_df/tbl/data.frame)
##  $ SpeciesIDDesc  : chr [1:300] "DOG" "DOG" "DOG" "CAT" ...
##  $ vaccination_yrs: num [1:300] 1 1 1 1 1 1 1 1 1 1 ...
##  $ victim_zip     : num [1:300] 40214 40291 40216 40215 40212 ...
##  $ ResultsIDDesc  : chr [1:300] "NEGATIVE" "NEGATIVE" "NEGATIVE" "NEGATIVE" ...
HAB1<- HAB1[!is.na(HAB1$victim_zip) & HAB1$victim_zip != "",]
str(HAB1)
## tibble [279 × 4] (S3: tbl_df/tbl/data.frame)
##  $ SpeciesIDDesc  : chr [1:279] "DOG" "DOG" "DOG" "CAT" ...
##  $ vaccination_yrs: num [1:279] 1 1 1 1 1 1 1 1 1 1 ...
##  $ victim_zip     : num [1:279] 40214 40291 40216 40215 40212 ...
##  $ ResultsIDDesc  : chr [1:279] "NEGATIVE" "NEGATIVE" "NEGATIVE" "NEGATIVE" ...
View(HAB1)
summary(HAB1)
##  SpeciesIDDesc      vaccination_yrs   victim_zip    ResultsIDDesc     
##  Length:279         Min.   :1.000   Min.   :40014   Length:279        
##  Class :character   1st Qu.:1.000   1st Qu.:40207   Class :character  
##  Mode  :character   Median :1.000   Median :40214   Mode  :character  
##                     Mean   :1.014   Mean   :40320                     
##                     3rd Qu.:1.000   3rd Qu.:40228                     
##                     Max.   :3.000   Max.   :47150
positive_cases <- HAB1[HAB1$ResultsIDDesc == "POSITIVE", ]
species_counts <- table(positive_cases$SpeciesIDDesc)

species_summary <- as.data.frame(species_counts)
names(species_summary) <- c("Species", "Count")

ggplot(species_summary, aes(x = Species, y = Count, fill = Species)) +
  geom_bar(stat = "identity", color = "black") +
  labs(title = "Positive Rabies Cases by Species", x = "Species", y = "Number of Positive Cases") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))  # Rotate x-axis labels for clarity

set.seed(123)

library(factoextra)
## Warning: package 'factoextra' was built under R version 4.3.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: lattice
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.3.3
library(rattle)
## Warning: package 'rattle' was built under R version 4.3.3
## Loading required package: tibble
## Warning: package 'tibble' was built under R version 4.3.3
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
HAB1$victim_zip<- as.numeric(as.factor(HAB1$victim_zip))
HAB1$SpeciesIDDesc <- as.numeric(as.factor(HAB1$SpeciesIDDesc))
data_clustering <- HAB1[c("vaccination_yrs", "victim_zip", "SpeciesIDDesc")]
sapply(data_clustering, is.numeric)
## vaccination_yrs      victim_zip   SpeciesIDDesc 
##            TRUE            TRUE            TRUE
summary(data_clustering)
##  vaccination_yrs   victim_zip    SpeciesIDDesc  
##  Min.   :1.000   Min.   : 1.00   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:16.00   1st Qu.:1.000  
##  Median :1.000   Median :22.00   Median :1.000  
##  Mean   :1.014   Mean   :23.78   Mean   :1.731  
##  3rd Qu.:1.000   3rd Qu.:30.00   3rd Qu.:2.000  
##  Max.   :3.000   Max.   :44.00   Max.   :5.000
data_scaled <- scale.default(data_clustering)
distance_mat<-dist(data_scaled, method = "euclidean")

hclust_avg<-hclust(distance_mat, method = "average")

cut_age<-cutree(hclust_avg, k=3)

view(cut_age)

plot(cut_age)

HAB1_cl<-mutate(HAB1, cluster=cut_age)
plot(HAB1_cl)

kmeans_result <- kmeans(data_scaled, centers = 3, nstart = 25)

print(kmeans_result$centers)
##   vaccination_yrs   victim_zip SpeciesIDDesc
## 1     11.74749268  1.115572108     1.2807621
## 2     -0.08481944 -0.051496555     1.5900987
## 3     -0.08481944  0.004472754    -0.4704542
trainControl<-trainControl(method="repeatedcv",number=10,repeats=3)
#CART
set.seed(7)
fit.cart<-train(ResultsIDDesc~.,data=HAB1,method="rpart",trControl=trainControl)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
#LDA
set.seed(7)
fit.lda<-train(ResultsIDDesc~.,data=HAB1,method="lda",trControl=trainControl)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
#SVM
set.seed(7)
fit.svm<-train(ResultsIDDesc~.,data=HAB1,method="svmRadial",trControl=trainControl)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
#KNN
set.seed(7)
fit.knn<-train(ResultsIDDesc~.,data=HAB1,method="knn",trControl=trainControl)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
#rf
set.seed(7)
fit.rf<-train(ResultsIDDesc~.,data=HAB1,method="rf",trControl=trainControl)
## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
#collect resamples
results<-resamples(list(CART=fit.cart,LDA=fit.lda,SVM=fit.svm,KNN=fit.knn,RF=fit.rf))

summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: CART, LDA, SVM, KNN, RF 
## Number of resamples: 30 
## 
## Accuracy 
##           Min.   1st Qu. Median      Mean 3rd Qu. Max. NA's
## CART 0.9642857 0.9655172      1 0.9859606       1    1    0
## LDA  0.9642857 0.9655172      1 0.9859606       1    1    0
## SVM  0.9642857 0.9655172      1 0.9859606       1    1    0
## KNN  0.9642857 0.9655172      1 0.9859606       1    1    0
## RF   0.9642857 0.9655172      1 0.9859606       1    1    0
## 
## Kappa 
##      Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## CART    0       0      0    0       0    0   18
## LDA     0       0      0    0       0    0   18
## SVM     0       0      0    0       0    0   18
## KNN     0       0      0    0       0    0   18
## RF      0       0      0    0       0    0   18
prop.table(table(HAB1$ResultsIDDesc))
## 
##   NEGATIVE   POSITIVE 
## 0.98566308 0.01433692
#Box and whisker plot
scales<-list(x=list(relation="free"), y=list(relation="free"))
bwplot(results, scales=scales)

#density plot
densityplot(results, scales=scales,pch="|")

#dot plot
dotplot(results,scales=scales)

splom(results)

#difference in plots
diffs<-diff(results)
summary(diffs)
## 
## Call:
## summary.diff.resamples(object = diffs)
## 
## p-value adjustment: bonferroni 
## Upper diagonal: estimates of the difference
## Lower diagonal: p-value for H0: difference = 0
## 
## Accuracy 
##      CART LDA SVM KNN RF
## CART       0   0   0   0
## LDA  NA        0   0   0
## SVM  NA   NA       0   0
## KNN  NA   NA  NA       0
## RF   NA   NA  NA  NA    
## 
## Kappa 
##      CART LDA SVM KNN RF
## CART       0   0   0   0
## LDA  NA        0   0   0
## SVM  NA   NA       0   0
## KNN  NA   NA  NA       0
## RF   NA   NA  NA  NA