Project Timeline: 1 Day
library(MASS)
library(dplyr)
# Take a working copy of the MASS::Aids2 data set as a plain data frame.
aids_df <- data.frame(Aids2)

# Preview the first five records.
head(aids_df, n = 5)
state: Grouped state of origin: “NSW” includes ACT, and “other” is WA, SA, NT and TAS.
sex: Sex of patient.
diag: (Julian) date of diagnosis.
death: (Julian) date of death or end of observation.
status: “A” (alive) or “D” (dead) at end of observation.
T.categ: Reported transmission category.
age: Age (years) at diagnosis.
# Convert the Julian day counts in `diag` and `death` to Date objects.
# Aids2 stores S-style Julian dates, i.e. days since 1960-01-01 (not the
# Unix epoch 1970-01-01): the largest `death` value, 11504, is the end of
# observation on 1991-07-01. Using 1970 as origin shifts every date by
# ten years.
aids_df <- aids_df |>
  mutate(across(c(diag, death), \(days) as.Date(days, origin = "1960-01-01")))
# Count the distinct values held by each column (returned as a named list).
lapply(aids_df, \(column) length(unique(column)))
## $state
## [1] 4
##
## $sex
## [1] 2
##
## $diag
## [1] 1580
##
## $death
## [1] 1148
##
## $status
## [1] 2
##
## $T.categ
## [1] 8
##
## $age
## [1] 74
# Names of the numeric columns.
aids_df |>
  dplyr::select(where(is.numeric)) |>
  names()
## [1] "age"
# Names of the remaining (non-numeric) columns.
aids_df |>
  dplyr::select(!where(is.numeric)) |>
  names()
## [1] "state" "sex" "diag" "death" "status" "T.categ"
state :
Sex :
diag :
death :
status :
T.categ :
# Show the first record as a one-row data frame.
head(aids_df, n = 1)
Question: How many people in the data have died, and how many are still alive?
# Count patients by survival status ("A" alive, "D" dead) and add each
# group's share of the total. The denominator is derived from the counts
# instead of a hard-coded 2843, so the proportions stay correct if the
# data are filtered or updated.
aids_df |>
  count(status) |>
  mutate(pct = round(n / sum(n), 2))
Observation :
# Frequency table of reported transmission categories, most common first.
# The share is computed from the counts themselves rather than a hard-coded
# row total (2843), so it stays correct if the data are filtered or updated.
T.categ_sum <-
  aids_df |>
  count(T.categ) |>
  mutate(pct = round(n / sum(n), 2)) |>
  arrange(desc(n))
T.categ_sum

# Bar chart of the counts, one bar per transmission category.
barplot(T.categ_sum$n, names.arg = T.categ_sum$T.categ)
Observations :
# Number of deaths per transmission category. The original referenced
# `hsaids_df`, which is never defined; the data frame is `aids_df`.
aids_df |>
  filter(status == "D") |>
  count(T.categ)

# The category summary table is no longer needed; drop it from the workspace.
rm(T.categ_sum)