# Load the animal-bite records; text columns are read in as factors.
# NOTE(review): this is an absolute, user-specific path, so the script only
# runs on a machine that has the file at exactly this location. Consider a
# project-relative path.
hab_csv_path <- "C:/Users/Ozili Nwokobia/OneDrive/Desktop/Health_AnimalBites.csv"
HAB <- read.csv(hab_csv_path, stringsAsFactors = TRUE)
# View() opens a GUI data viewer, so only call it from an interactive session.
if (interactive()) {
  View(HAB)
}
# Print the column types and a preview of each column.
str(HAB)
## 'data.frame': 9003 obs. of 15 variables:
## $ bite_date : Factor w/ 2703 levels "","1952-05-28 00:00:00",..: 4 5 6 7 8 9 10 11 12 13 ...
## $ SpeciesIDDesc : Factor w/ 10 levels "","BAT","CAT",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ BreedIDDesc : Factor w/ 102 levels "","AAUST. TERR.",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ GenderIDDesc : Factor w/ 4 levels "","FEMALE","MALE",..: 2 4 4 3 2 4 2 3 3 4 ...
## $ color : Factor w/ 714 levels "","APRICOT","B.MERLE-GR",..: 412 244 1 17 72 1 39 72 250 201 ...
## $ vaccination_yrs : int 1 NA NA NA NA NA 1 NA NA NA ...
## $ vaccination_date : Factor w/ 2108 levels "","1985-06-20 00:00:00",..: 2 1 1 1 1 1 3 1 1 1 ...
## $ victim_zip : Factor w/ 234 levels "","11215","13440",..: 109 101 102 1 1 94 86 1 1 1 ...
## $ AdvIssuedYNDesc : Factor w/ 3 levels "","NO","YES": 2 2 2 2 2 2 2 2 2 2 ...
## $ WhereBittenIDDesc: Factor w/ 4 levels "","BODY","HEAD",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ quarantine_date : Factor w/ 603 levels "","1985-05-05 00:00:00",..: 2 3 6 10 1 1 4 5 7 8 ...
## $ DispositionIDDesc: Factor w/ 5 levels "","DIED","KILLED",..: 5 5 5 5 5 5 5 4 5 4 ...
## $ head_sent_date : Factor w/ 326 levels "","2010-01-10 00:00:00",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ release_date : Factor w/ 583 levels "","2011-05-09 00:00:00",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ ResultsIDDesc : Factor w/ 4 levels "","NEGATIVE",..: 4 4 4 4 4 4 4 4 4 4 ...
# Per-column overview of the raw data: counts of the most frequent levels for
# factor columns, and quartiles plus the NA count for numeric columns.
summary(HAB)
## bite_date SpeciesIDDesc BreedIDDesc
## : 317 DOG :7029 :5244
## 2017-08-15 00:00:00: 13 CAT :1568 PIT BULL :1102
## 2010-04-01 00:00:00: 12 BAT : 237 GERM SHEPHERD : 327
## 2014-04-26 00:00:00: 12 : 118 LABRADOR RETRIV: 253
## 2015-03-17 00:00:00: 12 RACCOON: 27 BOXER : 181
## 2011-08-26 00:00:00: 11 OTHER : 11 CHICHAUHUA : 165
## (Other) :8626 (Other): 13 (Other) :1731
## GenderIDDesc color vaccination_yrs vaccination_date
## :2526 :2576 Min. : 1.000 :4888
## FEMALE :2016 BLACK : 671 1st Qu.: 1.000 2015-10-01 00:00:00: 8
## MALE :3832 BROWN : 433 Median : 1.000 2011-07-08 00:00:00: 7
## UNKNOWN: 629 WHITE : 394 Mean : 1.452 2012-05-14 00:00:00: 7
## BLK WHT: 277 3rd Qu.: 1.000 2013-04-01 00:00:00: 7
## TAN : 223 Max. :11.000 2013-05-08 00:00:00: 7
## (Other):4429 NA's :5265 (Other) :4079
## victim_zip AdvIssuedYNDesc WhereBittenIDDesc quarantine_date
## :1838 :6438 : 616 :6983
## 40272 : 376 NO :1914 BODY :6213 2010-04-05 00:00:00: 18
## 40291 : 368 YES: 651 HEAD :1244 2010-11-08 00:00:00: 14
## 40216 : 362 UNKNOWN: 930 2011-06-01 00:00:00: 14
## 40215 : 356 2010-02-01 00:00:00: 13
## 40214 : 348 2011-06-27 00:00:00: 13
## (Other):5355 (Other) :1948
## DispositionIDDesc head_sent_date release_date
## :7468 :8608 :7558
## DIED : 4 2010-05-21 00:00:00: 4 2016-07-05 00:00:00: 13
## KILLED : 16 2010-07-12 00:00:00: 3 2017-09-05 00:00:00: 12
## RELEASED: 912 2015-06-02 00:00:00: 3 2016-04-04 00:00:00: 11
## UNKNOWN : 603 2015-06-16 00:00:00: 3 2016-05-23 00:00:00: 9
## 2015-08-17 00:00:00: 3 2017-05-08 00:00:00: 9
## (Other) : 379 (Other) :1391
## ResultsIDDesc
## :7460
## NEGATIVE: 299
## POSITIVE: 4
## UNKNOWN :1240
##
##
##
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# CLEANING THE DATA ----
# Drop every column that is not used later in one select() call, leaving
# SpeciesIDDesc, vaccination_yrs, victim_zip and ResultsIDDesc.
HAB <- select(
  HAB,
  -c(
    BreedIDDesc, color, head_sent_date, release_date, quarantine_date,
    vaccination_date, bite_date, WhereBittenIDDesc, DispositionIDDesc,
    AdvIssuedYNDesc, GenderIDDesc
  )
)

# Impute missing vaccination_yrs with the median of the observed values.
observed_median <- median(HAB$vaccination_yrs, na.rm = TRUE)
HAB$vaccination_yrs[is.na(HAB$vaccination_yrs)] <- observed_median

# View() opens a GUI data viewer, so only call it from an interactive session.
if (interactive()) {
  View(HAB)
}
str(HAB)
## 'data.frame': 9003 obs. of 4 variables:
## $ SpeciesIDDesc : Factor w/ 10 levels "","BAT","CAT",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ vaccination_yrs: num 1 1 1 1 1 1 1 1 1 1 ...
## $ victim_zip : Factor w/ 234 levels "","11215","13440",..: 109 101 102 1 1 94 86 1 1 1 ...
## $ ResultsIDDesc : Factor w/ 4 levels "","NEGATIVE",..: 4 4 4 4 4 4 4 4 4 4 ...
# Remove rows whose test result is missing (NA) or blank.
result_recorded <- !(is.na(HAB$ResultsIDDesc) | HAB$ResultsIDDesc == "")
HAB1 <- HAB[result_recorded, ]
str(HAB1)
## 'data.frame': 1543 obs. of 4 variables:
## $ SpeciesIDDesc : Factor w/ 10 levels "","BAT","CAT",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ vaccination_yrs: num 1 1 1 1 1 1 1 1 1 1 ...
## $ victim_zip : Factor w/ 234 levels "","11215","13440",..: 109 101 102 1 1 94 86 1 1 1 ...
## $ ResultsIDDesc : Factor w/ 4 levels "","NEGATIVE",..: 4 4 4 4 4 4 4 4 4 4 ...
# Keep only rows with a definite test result, i.e. anything but "UNKNOWN".
definite_result <- HAB1$ResultsIDDesc != "UNKNOWN"
HAB1 <- HAB1[definite_result, ]
str(HAB1)
## 'data.frame': 303 obs. of 4 variables:
## $ SpeciesIDDesc : Factor w/ 10 levels "","BAT","CAT",..: 4 4 4 3 3 2 2 2 9 4 ...
## $ vaccination_yrs: num 1 1 1 1 1 1 1 1 1 1 ...
## $ victim_zip : Factor w/ 234 levels "","11215","13440",..: 97 123 99 98 95 117 102 124 117 109 ...
## $ ResultsIDDesc : Factor w/ 4 levels "","NEGATIVE",..: 2 2 2 2 2 2 2 2 2 2 ...
# Remove rows whose species is missing (NA) or blank.
species_recorded <- !(is.na(HAB1$SpeciesIDDesc) | HAB1$SpeciesIDDesc == "")
HAB1 <- HAB1[species_recorded, ]
str(HAB1)
## 'data.frame': 300 obs. of 4 variables:
## $ SpeciesIDDesc : Factor w/ 10 levels "","BAT","CAT",..: 4 4 4 3 3 2 2 2 9 4 ...
## $ vaccination_yrs: num 1 1 1 1 1 1 1 1 1 1 ...
## $ victim_zip : Factor w/ 234 levels "","11215","13440",..: 97 123 99 98 95 117 102 124 117 109 ...
## $ ResultsIDDesc : Factor w/ 4 levels "","NEGATIVE",..: 2 2 2 2 2 2 2 2 2 2 ...
# Remove rows whose victim ZIP code is missing (NA) or blank.
zip_recorded <- !(is.na(HAB1$victim_zip) | HAB1$victim_zip == "")
HAB1 <- HAB1[zip_recorded, ]
str(HAB1)
## 'data.frame': 279 obs. of 4 variables:
## $ SpeciesIDDesc : Factor w/ 10 levels "","BAT","CAT",..: 4 4 4 3 3 2 2 2 9 4 ...
## $ vaccination_yrs: num 1 1 1 1 1 1 1 1 1 1 ...
## $ victim_zip : Factor w/ 234 levels "","11215","13440",..: 97 123 99 98 95 117 102 124 117 109 ...
## $ ResultsIDDesc : Factor w/ 4 levels "","NEGATIVE",..: 2 2 2 2 2 2 2 2 2 2 ...
# Inspect the filtered data. View() opens a GUI data viewer, so only call it
# from an interactive session.
if (interactive()) {
  View(HAB1)
}
summary(HAB1)
## SpeciesIDDesc vaccination_yrs victim_zip ResultsIDDesc
## BAT :158 Min. :1.000 40207 : 17 : 0
## CAT : 57 1st Qu.:1.000 40211 : 17 NEGATIVE:275
## DOG : 52 Median :1.000 40205 : 16 POSITIVE: 4
## RACCOON: 7 Mean :1.014 40214 : 16 UNKNOWN : 0
## OTHER : 5 3rd Qu.:1.000 40223 : 16
## : 0 Max. :3.000 40299 : 16
## (Other): 0 (Other):181
# Count rabies-positive cases per species and draw them as a bar chart.
# Row filtering does not drop factor levels, so SpeciesIDDesc still carries
# every level from the raw file (including the blank "" level). Restrict the
# levels to species that actually remain in HAB1; species with no positive
# case keep a zero-height bar, but filtered-out categories no longer appear.
tested_species <- levels(droplevels(HAB1$SpeciesIDDesc))
positive_cases <- HAB1[HAB1$ResultsIDDesc == "POSITIVE", ]
species_counts <- table(
  factor(positive_cases$SpeciesIDDesc, levels = tested_species)
)
species_summary <- as.data.frame(species_counts)
names(species_summary) <- c("Species", "Count")

ggplot(species_summary, aes(x = Species, y = Count, fill = Species)) +
  # geom_col() plots the supplied counts directly.
  geom_col(color = "black") +
  labs(
    title = "Positive Rabies Cases by Species",
    x = "Species",
    y = "Number of Positive Cases"
  ) +
  theme_minimal() +
  # Rotate x-axis labels for clarity.
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Fix the random-number seed so the randomised steps further down (the
# kmeans() random starts) give the same result on every run.
set.seed(123)
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.3.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(caret)
## Loading required package: lattice
library(rpart)
## Warning: package 'rpart' was built under R version 4.3.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.3.3
library(rattle)
## Warning: package 'rattle' was built under R version 4.3.3
## Loading required package: tibble
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
# Encode the clustering inputs as numbers.
#
# victim_zip: convert through as.character() so the column holds the actual
# ZIP value. as.numeric() applied to a factor returns the internal level index
# instead (R FAQ 7.10), which is what the earlier version of this step stored.
# The summary and cluster centres recorded in the "##" output lines after this
# point came from that level-index encoding and need to be regenerated.
zip_numeric <- as.numeric(as.character(HAB1$victim_zip))
# Fail loudly if any ZIP is not a plain number rather than clustering on NAs.
stopifnot(!anyNA(zip_numeric))
HAB1$victim_zip <- zip_numeric

# NOTE(review): SpeciesIDDesc is a nominal category; these integer codes impose
# an arbitrary ordering and spacing on species once fed to a Euclidean
# distance. Consider dummy variables or a mixed-type distance -- left as-is
# here because it would change the shape of the clustering input.
HAB1$SpeciesIDDesc <- as.numeric(as.factor(HAB1$SpeciesIDDesc))

clustering_columns <- c("vaccination_yrs", "victim_zip", "SpeciesIDDesc")
data_clustering <- HAB1[clustering_columns]
# Confirm every clustering column is numeric (vapply pins the result type).
vapply(data_clustering, is.numeric, logical(1))
## vaccination_yrs victim_zip SpeciesIDDesc
## TRUE TRUE TRUE
summary(data_clustering)
## vaccination_yrs victim_zip SpeciesIDDesc
## Min. :1.000 Min. : 43.0 Min. :2.000
## 1st Qu.:1.000 1st Qu.: 90.0 1st Qu.:2.000
## Median :1.000 Median : 97.0 Median :2.000
## Mean :1.014 Mean :100.2 Mean :2.842
## 3rd Qu.:1.000 3rd Qu.:108.0 3rd Qu.:3.000
## Max. :3.000 Max. :192.0 Max. :9.000
# Centre and scale each clustering column (scale() defaults), calling the
# generic rather than the scale.default method directly.
data_scaled <- scale(data_clustering)
# Pairwise Euclidean distances between the scaled rows.
distance_mat <- dist(data_scaled, method = "euclidean")
# Average-linkage hierarchical clustering, cut into three groups.
hclust_avg <- hclust(distance_mat, method = "average")
cut_age <- cutree(hclust_avg, k = 3)
# Use utils::View() like the rest of the script; lowercase view() is only
# found when tibble happens to be attached. Guarded because it opens a GUI.
if (interactive()) {
  View(cut_age)
}
# Plot each row's cluster label against its position in the vector.
plot(cut_age)

# Attach each row's hierarchical-cluster label as a new column, then draw the
# default plot of the resulting data frame.
HAB1_cl <- HAB1 %>% mutate(cluster = cut_age)
plot(HAB1_cl)

# k-means on the scaled data with 3 centres and 25 random starts; print the
# fitted cluster centres (in scaled units).
kmeans_result <- kmeans(x = data_scaled, centers = 3, nstart = 25)
print(kmeans_result[["centers"]])
## vaccination_yrs victim_zip SpeciesIDDesc
## 1 -0.08481944 -0.008170624 3.8378609
## 2 -0.08481944 -0.005921326 -0.1800879
## 3 11.74749268 0.833599390 0.8344861