library(readr)
## Warning: package 'readr' was built under R version 4.3.3
HAB <- read_csv("C:/Users/scott6522/Downloads/Health_AnimalBites (1).csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 9003 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): SpeciesIDDesc, BreedIDDesc, GenderIDDesc, color, AdvIssuedYNDesc, ...
## dbl (2): vaccination_yrs, victim_zip
## dttm (5): bite_date, vaccination_date, quarantine_date, head_sent_date, rele...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
View(HAB)
str(HAB)
## spc_tbl_ [9,003 × 15] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ bite_date : POSIXct[1:9003], format: "1985-05-05" "1986-02-12" ...
## $ SpeciesIDDesc : chr [1:9003] "DOG" "DOG" "DOG" "DOG" ...
## $ BreedIDDesc : chr [1:9003] NA NA NA NA ...
## $ GenderIDDesc : chr [1:9003] "FEMALE" "UNKNOWN" "UNKNOWN" "MALE" ...
## $ color : chr [1:9003] "LIG. BROWN" "BRO & BLA" NA "BLA & BRO" ...
## $ vaccination_yrs : num [1:9003] 1 NA NA NA NA NA 1 NA NA NA ...
## $ vaccination_date : POSIXct[1:9003], format: "1985-06-20" NA ...
## $ victim_zip : num [1:9003] 40229 40218 40219 NA NA ...
## $ AdvIssuedYNDesc : chr [1:9003] "NO" "NO" "NO" "NO" ...
## $ WhereBittenIDDesc: chr [1:9003] "BODY" "BODY" "BODY" "BODY" ...
## $ quarantine_date : POSIXct[1:9003], format: "1985-05-05" "1986-02-12" ...
## $ DispositionIDDesc: chr [1:9003] "UNKNOWN" "UNKNOWN" "UNKNOWN" "UNKNOWN" ...
## $ head_sent_date : POSIXct[1:9003], format: NA NA ...
## $ release_date : POSIXct[1:9003], format: NA NA ...
## $ ResultsIDDesc : chr [1:9003] "UNKNOWN" "UNKNOWN" "UNKNOWN" "UNKNOWN" ...
## - attr(*, "spec")=
## .. cols(
## .. bite_date = col_datetime(format = ""),
## .. SpeciesIDDesc = col_character(),
## .. BreedIDDesc = col_character(),
## .. GenderIDDesc = col_character(),
## .. color = col_character(),
## .. vaccination_yrs = col_double(),
## .. vaccination_date = col_datetime(format = ""),
## .. victim_zip = col_double(),
## .. AdvIssuedYNDesc = col_character(),
## .. WhereBittenIDDesc = col_character(),
## .. quarantine_date = col_datetime(format = ""),
## .. DispositionIDDesc = col_character(),
## .. head_sent_date = col_datetime(format = ""),
## .. release_date = col_datetime(format = ""),
## .. ResultsIDDesc = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
summary(HAB)
## bite_date SpeciesIDDesc BreedIDDesc
## Min. :1952-05-28 00:00:00.00 Length:9003 Length:9003
## 1st Qu.:2011-11-01 00:00:00.00 Class :character Class :character
## Median :2013-09-07 00:00:00.00 Mode :character Mode :character
## Mean :2014-02-01 06:55:17.33
## 3rd Qu.:2015-08-16 18:00:00.00
## Max. :5013-07-15 00:00:00.00
## NA's :317
## GenderIDDesc color vaccination_yrs
## Length:9003 Length:9003 Min. : 1.000
## Class :character Class :character 1st Qu.: 1.000
## Mode :character Mode :character Median : 1.000
## Mean : 1.452
## 3rd Qu.: 1.000
## Max. :11.000
## NA's :5265
## vaccination_date victim_zip AdvIssuedYNDesc
## Min. :1985-06-20 00:00:00.00 Min. : 6107 Length:9003
## 1st Qu.:2011-07-05 12:00:00.00 1st Qu.:40211 Class :character
## Median :2013-06-05 00:00:00.00 Median :40218 Mode :character
## Mean :2013-05-17 03:39:03.72 Mean :40437
## 3rd Qu.:2015-04-02 00:00:00.00 3rd Qu.:40243
## Max. :2018-07-21 00:00:00.00 Max. :99674
## NA's :4888 NA's :1841
## WhereBittenIDDesc quarantine_date DispositionIDDesc
## Length:9003 Min. :1985-05-05 00:00:00.00 Length:9003
## Class :character 1st Qu.:2010-06-09 00:00:00.00 Class :character
## Mode :character Median :2010-11-13 12:00:00.00 Mode :character
## Mean :2010-09-08 11:55:00.58
## 3rd Qu.:2011-06-16 00:00:00.00
## Max. :2017-09-01 00:00:00.00
## NA's :6983
## head_sent_date release_date
## Min. :2010-01-10 00:00:00.00 Min. :2011-05-09 00:00:00.00
## 1st Qu.:2012-08-25 12:00:00.00 1st Qu.:2015-11-03 00:00:00.00
## Median :2015-09-08 00:00:00.00 Median :2016-06-13 00:00:00.00
## Mean :2014-11-25 22:47:05.31 Mean :2016-04-17 12:31:23.46
## 3rd Qu.:2016-09-12 12:00:00.00 3rd Qu.:2017-02-22 00:00:00.00
## Max. :2017-09-05 00:00:00.00 Max. :2019-05-19 00:00:00.00
## NA's :8608 NA's :7558
## ResultsIDDesc
## Length:9003
## Class :character
## Mode :character
##
##
##
##
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
HAB<- select(HAB, -BreedIDDesc)
HAB<- select(HAB, -color)
HAB<- select(HAB, -head_sent_date)
HAB<- select(HAB, -release_date)
HAB<- select(HAB, -quarantine_date)
HAB<- select(HAB, -vaccination_date)
HAB<- select(HAB, -bite_date)
HAB<- select(HAB, -WhereBittenIDDesc)
HAB<- select(HAB, -DispositionIDDesc)
HAB<- select(HAB, -AdvIssuedYNDesc)
HAB<- select(HAB, -GenderIDDesc)
HAB$vaccination_yrs[is.na(HAB$vaccination_yrs)] <- median(HAB$vaccination_yrs, na.rm = TRUE)
View(HAB)
str(HAB)
## tibble [9,003 × 4] (S3: tbl_df/tbl/data.frame)
## $ SpeciesIDDesc : chr [1:9003] "DOG" "DOG" "DOG" "DOG" ...
## $ vaccination_yrs: num [1:9003] 1 1 1 1 1 1 1 1 1 1 ...
## $ victim_zip : num [1:9003] 40229 40218 40219 NA NA ...
## $ ResultsIDDesc : chr [1:9003] "UNKNOWN" "UNKNOWN" "UNKNOWN" "UNKNOWN" ...
HAB1 <- HAB[!is.na(HAB$ResultsIDDesc) & HAB$ResultsIDDesc != "", ]
str(HAB1)
## tibble [1,543 × 4] (S3: tbl_df/tbl/data.frame)
## $ SpeciesIDDesc : chr [1:1543] "DOG" "DOG" "DOG" "DOG" ...
## $ vaccination_yrs: num [1:1543] 1 1 1 1 1 1 1 1 1 1 ...
## $ victim_zip : num [1:1543] 40229 40218 40219 NA NA ...
## $ ResultsIDDesc : chr [1:1543] "UNKNOWN" "UNKNOWN" "UNKNOWN" "UNKNOWN" ...
HAB1 <- HAB1[HAB1$ResultsIDDesc != "UNKNOWN", ]
str(HAB1)
## tibble [303 × 4] (S3: tbl_df/tbl/data.frame)
## $ SpeciesIDDesc : chr [1:303] "DOG" "DOG" "DOG" "CAT" ...
## $ vaccination_yrs: num [1:303] 1 1 1 1 1 1 1 1 1 1 ...
## $ victim_zip : num [1:303] 40214 40291 40216 40215 40212 ...
## $ ResultsIDDesc : chr [1:303] "NEGATIVE" "NEGATIVE" "NEGATIVE" "NEGATIVE" ...
HAB1 <- HAB1[!is.na(HAB1$SpeciesIDDesc) & HAB1$SpeciesIDDesc != "", ]
str(HAB1)
## tibble [300 × 4] (S3: tbl_df/tbl/data.frame)
## $ SpeciesIDDesc : chr [1:300] "DOG" "DOG" "DOG" "CAT" ...
## $ vaccination_yrs: num [1:300] 1 1 1 1 1 1 1 1 1 1 ...
## $ victim_zip : num [1:300] 40214 40291 40216 40215 40212 ...
## $ ResultsIDDesc : chr [1:300] "NEGATIVE" "NEGATIVE" "NEGATIVE" "NEGATIVE" ...
HAB1<- HAB1[!is.na(HAB1$victim_zip) & HAB1$victim_zip != "",]
str(HAB1)
## tibble [279 × 4] (S3: tbl_df/tbl/data.frame)
## $ SpeciesIDDesc : chr [1:279] "DOG" "DOG" "DOG" "CAT" ...
## $ vaccination_yrs: num [1:279] 1 1 1 1 1 1 1 1 1 1 ...
## $ victim_zip : num [1:279] 40214 40291 40216 40215 40212 ...
## $ ResultsIDDesc : chr [1:279] "NEGATIVE" "NEGATIVE" "NEGATIVE" "NEGATIVE" ...
View(HAB1)
summary(HAB1)
## SpeciesIDDesc vaccination_yrs victim_zip ResultsIDDesc
## Length:279 Min. :1.000 Min. :40014 Length:279
## Class :character 1st Qu.:1.000 1st Qu.:40207 Class :character
## Mode :character Median :1.000 Median :40214 Mode :character
## Mean :1.014 Mean :40320
## 3rd Qu.:1.000 3rd Qu.:40228
## Max. :3.000 Max. :47150
positive_cases <- HAB1[HAB1$ResultsIDDesc == "POSITIVE", ]
species_counts <- table(positive_cases$SpeciesIDDesc)
species_summary <- as.data.frame(species_counts)
names(species_summary) <- c("Species", "Count")
ggplot(species_summary, aes(x = Species, y = Count, fill = Species)) +
geom_bar(stat = "identity", color = "black") +
labs(title = "Positive Rabies Cases by Species", x = "Species", y = "Number of Positive Cases") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) # Rotate x-axis labels for clarity

set.seed(123)
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.3.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: lattice
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.3.3
library(rattle)
## Warning: package 'rattle' was built under R version 4.3.3
## Loading required package: tibble
## Warning: package 'tibble' was built under R version 4.3.3
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
HAB1$victim_zip<- as.numeric(as.factor(HAB1$victim_zip))
HAB1$SpeciesIDDesc <- as.numeric(as.factor(HAB1$SpeciesIDDesc))
data_clustering <- HAB1[c("vaccination_yrs", "victim_zip", "SpeciesIDDesc")]
sapply(data_clustering, is.numeric)
## vaccination_yrs victim_zip SpeciesIDDesc
## TRUE TRUE TRUE
summary(data_clustering)
## vaccination_yrs victim_zip SpeciesIDDesc
## Min. :1.000 Min. : 1.00 Min. :1.000
## 1st Qu.:1.000 1st Qu.:16.00 1st Qu.:1.000
## Median :1.000 Median :22.00 Median :1.000
## Mean :1.014 Mean :23.78 Mean :1.731
## 3rd Qu.:1.000 3rd Qu.:30.00 3rd Qu.:2.000
## Max. :3.000 Max. :44.00 Max. :5.000
data_scaled <- scale.default(data_clustering)
distance_mat<-dist(data_scaled, method = "euclidean")
hclust_avg<-hclust(distance_mat, method = "average")
cut_age<-cutree(hclust_avg, k=3)
view(cut_age)
plot(cut_age)

HAB1_cl<-mutate(HAB1, cluster=cut_age)
plot(HAB1_cl)

kmeans_result <- kmeans(data_scaled, centers = 3, nstart = 25)
print(kmeans_result$centers)
## vaccination_yrs victim_zip SpeciesIDDesc
## 1 11.74749268 1.115572108 1.2807621
## 2 -0.08481944 -0.051496555 1.5900987
## 3 -0.08481944 0.004472754 -0.4704542
trainControl<-trainControl(method="repeatedcv",number=10,repeats=3)
#CART
set.seed(7)
fit.cart<-train(ResultsIDDesc~.,data=HAB1,method="rpart",trControl=trainControl)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
#LDA
set.seed(7)
fit.lda<-train(ResultsIDDesc~.,data=HAB1,method="lda",trControl=trainControl)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
#SVM
set.seed(7)
fit.svm<-train(ResultsIDDesc~.,data=HAB1,method="svmRadial",trControl=trainControl)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
#KNN
set.seed(7)
fit.knn<-train(ResultsIDDesc~.,data=HAB1,method="knn",trControl=trainControl)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
#rf
set.seed(7)
fit.rf<-train(ResultsIDDesc~.,data=HAB1,method="rf",trControl=trainControl)
## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
#collect resamples
results<-resamples(list(CART=fit.cart,LDA=fit.lda,SVM=fit.svm,KNN=fit.knn,RF=fit.rf))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: CART, LDA, SVM, KNN, RF
## Number of resamples: 30
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## CART 0.9642857 0.9655172 1 0.9859606 1 1 0
## LDA 0.9642857 0.9655172 1 0.9859606 1 1 0
## SVM 0.9642857 0.9655172 1 0.9859606 1 1 0
## KNN 0.9642857 0.9655172 1 0.9859606 1 1 0
## RF 0.9642857 0.9655172 1 0.9859606 1 1 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## CART 0 0 0 0 0 0 18
## LDA 0 0 0 0 0 0 18
## SVM 0 0 0 0 0 0 18
## KNN 0 0 0 0 0 0 18
## RF 0 0 0 0 0 0 18
prop.table(table(HAB1$ResultsIDDesc))
##
## NEGATIVE POSITIVE
## 0.98566308 0.01433692
#Box and whisker plot
scales<-list(x=list(relation="free"), y=list(relation="free"))
bwplot(results, scales=scales)

#density plot
densityplot(results, scales=scales,pch="|")

#dot plot
dotplot(results,scales=scales)

splom(results)

#difference in plots
diffs<-diff(results)
summary(diffs)
##
## Call:
## summary.diff.resamples(object = diffs)
##
## p-value adjustment: bonferroni
## Upper diagonal: estimates of the difference
## Lower diagonal: p-value for H0: difference = 0
##
## Accuracy
## CART LDA SVM KNN RF
## CART 0 0 0 0
## LDA NA 0 0 0
## SVM NA NA 0 0
## KNN NA NA NA 0
## RF NA NA NA NA
##
## Kappa
## CART LDA SVM KNN RF
## CART 0 0 0 0
## LDA NA 0 0 0
## SVM NA NA 0 0
## KNN NA NA NA 0
## RF NA NA NA NA