#read the two data files from the github I created
tbdata1 <- read.csv("https://raw.githubusercontent.com/jeffnieman11/tb_database/master/tb(1).csv", header=FALSE)
popdata <- read.csv("https://raw.githubusercontent.com/jeffnieman11/tb_database/master/population.csv")
head(tbdata1)
## V1 V2 V3 V4 V5 V6
## 1 Afghanistan 1995 female -1 -1 -1
## 2 Afghanistan 1995 male -1 -1 -1
## 3 Afghanistan 1996 female -1 -1 -1
## 4 Afghanistan 1996 male -1 -1 -1
## 5 Afghanistan 1997 female 5 96 1
## 6 Afghanistan 1997 male 0 26 0
head(popdata)
## country year population
## 1 Afghanistan 1995 17586073
## 2 Algeria 1995 29315463
## 3 Angola 1995 12104952
## 4 Argentina 1995 34833168
## 5 Azerbaijan 1995 7770806
## 6 Bangladesh 1995 119869585
#create column headers
colnames(tbdata1) <-c("Country", "Year", "Sex", "Child", "Adult", "Elderly")
#correct the -1 in the data frame
tbdata1$Child <- gsub(-1,0,tbdata1$Child)
tbdata1$Adult <- gsub(-1,0,tbdata1$Adult)
tbdata1$Elderly <- gsub(-1,0,tbdata1$Elderly)
head(tbdata1)
## Country Year Sex Child Adult Elderly
## 1 Afghanistan 1995 female 0 0 0
## 2 Afghanistan 1995 male 0 0 0
## 3 Afghanistan 1996 female 0 0 0
## 4 Afghanistan 1996 male 0 0 0
## 5 Afghanistan 1997 female 5 96 1
## 6 Afghanistan 1997 male 0 26 0
#sum up the cases
tbdata1$Child <- as.numeric(tbdata1$Child)
tbdata1$Adult <- as.numeric(tbdata1$Adult)
tbdata1$Elderly <- as.numeric(tbdata1$Elderly)
tbdata1$Cases <- (tbdata1$Child + tbdata1$Adult + tbdata1$Elderly)
head(tbdata1)
## Country Year Sex Child Adult Elderly Cases
## 1 Afghanistan 1995 female 0 0 0 0
## 2 Afghanistan 1995 male 0 0 0 0
## 3 Afghanistan 1996 female 0 0 0 0
## 4 Afghanistan 1996 male 0 0 0 0
## 5 Afghanistan 1997 female 5 96 1 102
## 6 Afghanistan 1997 male 0 26 0 26
#create subsets for males and females
tbdatamale <- subset(tbdata1, tbdata1$Sex=="male")
head(tbdatamale)
## Country Year Sex Child Adult Elderly Cases
## 2 Afghanistan 1995 male 0 0 0 0
## 4 Afghanistan 1996 male 0 0 0 0
## 6 Afghanistan 1997 male 0 26 0 26
## 8 Afghanistan 1998 male 30 500 41 571
## 10 Afghanistan 1999 male 8 212 8 228
## 12 Afghanistan 2000 male 52 783 80 915
tbdatafemale <- subset(tbdata1, tbdata1$Sex == "female")
head(tbdatafemale)
## Country Year Sex Child Adult Elderly Cases
## 1 Afghanistan 1995 female 0 0 0 0
## 3 Afghanistan 1996 female 0 0 0 0
## 5 Afghanistan 1997 female 5 96 1 102
## 7 Afghanistan 1998 female 45 1142 20 1207
## 9 Afghanistan 1999 female 25 484 8 517
## 11 Afghanistan 2000 female 93 1622 36 1751
#join male and female subset when year and country match and sum up all cases
library(sqldf)
## Warning: package 'sqldf' was built under R version 3.2.3
## Loading required package: gsubfn
## Warning: package 'gsubfn' was built under R version 3.2.3
## Loading required package: proto
## Loading required package: RSQLite
## Warning: package 'RSQLite' was built under R version 3.2.3
## Loading required package: DBI
tbdata2 <- sqldf("select tbdatamale.Country, tbdatamale.Year, tbdatamale.Cases as 'malecases', tbdatafemale.Cases as 'femalecases' from tbdatamale inner join tbdatafemale on tbdatamale.Country = tbdatafemale.Country and tbdatamale.Year = tbdatafemale.Year")
## Loading required package: tcltk
tbdata2$malecases <- as.numeric(tbdata2$malecases)
tbdata2$femalecases <- as.numeric(tbdata2$femalecases)
tbdata2$Cases <- tbdata2$malecases + tbdata2$femalecases
head(tbdata2)
## Country Year malecases femalecases Cases
## 1 Afghanistan 1995 0 0 0
## 2 Afghanistan 1996 0 0 0
## 3 Afghanistan 1997 26 102 128
## 4 Afghanistan 1998 571 1207 1778
## 5 Afghanistan 1999 228 517 745
## 6 Afghanistan 2000 915 1751 2666
#join with the population by year and country
tbpopjoin <- sqldf("select tbdata2.Country, tbdata2.Year, Cases, population from tbdata2 inner join popdata on tbdata2.Country = popdata.country and tbdata2.Year = popdata.year")
head(tbpopjoin)
## Country Year Cases population
## 1 Afghanistan 1995 0 17586073
## 2 Afghanistan 1996 0 18415307
## 3 Afghanistan 1997 128 19021226
## 4 Afghanistan 1998 1778 19496836
## 5 Afghanistan 1999 745 19987071
## 6 Afghanistan 2000 2666 20595360
#create rate value
tbpopjoin$Rate <- tbpopjoin$Cases/tbpopjoin$population
head(tbpopjoin)
## Country Year Cases population Rate
## 1 Afghanistan 1995 0 17586073 0.000000e+00
## 2 Afghanistan 1996 0 18415307 0.000000e+00
## 3 Afghanistan 1997 128 19021226 6.729324e-06
## 4 Afghanistan 1998 1778 19496836 9.119428e-05
## 5 Afghanistan 1999 745 19987071 3.727410e-05
## 6 Afghanistan 2000 2666 20595360 1.294466e-04
#eliminate extra columns and create the R data frame with the proper data
tbanswer <- tbpopjoin[,c(1,2,5)]
head(tbanswer)
## Country Year Rate
## 1 Afghanistan 1995 0.000000e+00
## 2 Afghanistan 1996 0.000000e+00
## 3 Afghanistan 1997 6.729324e-06
## 4 Afghanistan 1998 9.119428e-05
## 5 Afghanistan 1999 3.727410e-05
## 6 Afghanistan 2000 1.294466e-04