#read the two data files from the github I created
tbdata1 <- read.csv("https://raw.githubusercontent.com/jeffnieman11/tb_database/master/tb(1).csv", header=FALSE)
popdata <- read.csv("https://raw.githubusercontent.com/jeffnieman11/tb_database/master/population.csv")
head(tbdata1)
##            V1   V2     V3 V4 V5 V6
## 1 Afghanistan 1995 female -1 -1 -1
## 2 Afghanistan 1995   male -1 -1 -1
## 3 Afghanistan 1996 female -1 -1 -1
## 4 Afghanistan 1996   male -1 -1 -1
## 5 Afghanistan 1997 female  5 96  1
## 6 Afghanistan 1997   male  0 26  0
head(popdata)
##       country year population
## 1 Afghanistan 1995   17586073
## 2     Algeria 1995   29315463
## 3      Angola 1995   12104952
## 4   Argentina 1995   34833168
## 5  Azerbaijan 1995    7770806
## 6  Bangladesh 1995  119869585
#create column headers
colnames(tbdata1) <-c("Country", "Year", "Sex", "Child", "Adult", "Elderly")

#correct the -1 in the data frame
tbdata1$Child <- gsub(-1,0,tbdata1$Child)
tbdata1$Adult <- gsub(-1,0,tbdata1$Adult)
tbdata1$Elderly <- gsub(-1,0,tbdata1$Elderly)
head(tbdata1)
##       Country Year    Sex Child Adult Elderly
## 1 Afghanistan 1995 female     0     0       0
## 2 Afghanistan 1995   male     0     0       0
## 3 Afghanistan 1996 female     0     0       0
## 4 Afghanistan 1996   male     0     0       0
## 5 Afghanistan 1997 female     5    96       1
## 6 Afghanistan 1997   male     0    26       0
#sum up the cases
tbdata1$Child <- as.numeric(tbdata1$Child)
tbdata1$Adult <- as.numeric(tbdata1$Adult)
tbdata1$Elderly <- as.numeric(tbdata1$Elderly)
tbdata1$Cases <- (tbdata1$Child + tbdata1$Adult + tbdata1$Elderly)
head(tbdata1)
##       Country Year    Sex Child Adult Elderly Cases
## 1 Afghanistan 1995 female     0     0       0     0
## 2 Afghanistan 1995   male     0     0       0     0
## 3 Afghanistan 1996 female     0     0       0     0
## 4 Afghanistan 1996   male     0     0       0     0
## 5 Afghanistan 1997 female     5    96       1   102
## 6 Afghanistan 1997   male     0    26       0    26
#create subsets for males and females
tbdatamale <- subset(tbdata1, tbdata1$Sex=="male")
head(tbdatamale)
##        Country Year  Sex Child Adult Elderly Cases
## 2  Afghanistan 1995 male     0     0       0     0
## 4  Afghanistan 1996 male     0     0       0     0
## 6  Afghanistan 1997 male     0    26       0    26
## 8  Afghanistan 1998 male    30   500      41   571
## 10 Afghanistan 1999 male     8   212       8   228
## 12 Afghanistan 2000 male    52   783      80   915
tbdatafemale <- subset(tbdata1, tbdata1$Sex == "female")
head(tbdatafemale)
##        Country Year    Sex Child Adult Elderly Cases
## 1  Afghanistan 1995 female     0     0       0     0
## 3  Afghanistan 1996 female     0     0       0     0
## 5  Afghanistan 1997 female     5    96       1   102
## 7  Afghanistan 1998 female    45  1142      20  1207
## 9  Afghanistan 1999 female    25   484       8   517
## 11 Afghanistan 2000 female    93  1622      36  1751
#join male and female subset when year and country match and sum up all cases
library(sqldf)
## Warning: package 'sqldf' was built under R version 3.2.3
## Loading required package: gsubfn
## Warning: package 'gsubfn' was built under R version 3.2.3
## Loading required package: proto
## Loading required package: RSQLite
## Warning: package 'RSQLite' was built under R version 3.2.3
## Loading required package: DBI
tbdata2 <- sqldf("select tbdatamale.Country, tbdatamale.Year, tbdatamale.Cases as 'malecases', tbdatafemale.Cases as 'femalecases' from tbdatamale inner join tbdatafemale on tbdatamale.Country = tbdatafemale.Country and tbdatamale.Year = tbdatafemale.Year")
## Loading required package: tcltk
tbdata2$malecases <- as.numeric(tbdata2$malecases)
tbdata2$femalecases <- as.numeric(tbdata2$femalecases)
tbdata2$Cases <- tbdata2$malecases + tbdata2$femalecases
head(tbdata2)
##       Country Year malecases femalecases Cases
## 1 Afghanistan 1995         0           0     0
## 2 Afghanistan 1996         0           0     0
## 3 Afghanistan 1997        26         102   128
## 4 Afghanistan 1998       571        1207  1778
## 5 Afghanistan 1999       228         517   745
## 6 Afghanistan 2000       915        1751  2666
#join with the population by year and country
tbpopjoin <- sqldf("select tbdata2.Country, tbdata2.Year, Cases, population from tbdata2 inner join popdata on tbdata2.Country = popdata.country and tbdata2.Year = popdata.year")
head(tbpopjoin)
##       Country Year Cases population
## 1 Afghanistan 1995     0   17586073
## 2 Afghanistan 1996     0   18415307
## 3 Afghanistan 1997   128   19021226
## 4 Afghanistan 1998  1778   19496836
## 5 Afghanistan 1999   745   19987071
## 6 Afghanistan 2000  2666   20595360
#create rate value
tbpopjoin$Rate <- tbpopjoin$Cases/tbpopjoin$population
head(tbpopjoin)
##       Country Year Cases population         Rate
## 1 Afghanistan 1995     0   17586073 0.000000e+00
## 2 Afghanistan 1996     0   18415307 0.000000e+00
## 3 Afghanistan 1997   128   19021226 6.729324e-06
## 4 Afghanistan 1998  1778   19496836 9.119428e-05
## 5 Afghanistan 1999   745   19987071 3.727410e-05
## 6 Afghanistan 2000  2666   20595360 1.294466e-04
#eliminate extra columns and create the R data frame with the proper data
tbanswer <- tbpopjoin[,c(1,2,5)]
head(tbanswer)
##       Country Year         Rate
## 1 Afghanistan 1995 0.000000e+00
## 2 Afghanistan 1996 0.000000e+00
## 3 Afghanistan 1997 6.729324e-06
## 4 Afghanistan 1998 9.119428e-05
## 5 Afghanistan 1999 3.727410e-05
## 6 Afghanistan 2000 1.294466e-04