# Working with country TB cases data file

Author: Arindam Barman Date: February 12, 2016

# Data Preparation in MYSQL

Load file in MYSQL

Create the tb database by running table create query and load the tb csv file

select * from tb; desc tb;

Select count(distinct year ) from tb;

Data Cleaning-

UPDATE tb SET child=0 WHERE child IS NULL ;

UPDATE tb SET adult=0 WHERE adult IS NULL ;

UPDATE tb SET elderly=0 WHERE elderly IS NULL ;

Creating a new column total_cases-

ALTER TABLE tb ADD total_cases integer(15);

update tb set total_cases = (child+adult+ elderly);

Extract file Country_tb_cases for R

select country,year,sum(child),sum(adult),sum(elderly),sum(total_cases) from tb group by year,country;

country_population<-read.csv("C:/Users/Arindam/Documents/Data Science/Cuny/Data 607/Assignments/population.csv",header=TRUE, stringsAsFactors = FALSE)

country_tb_cases<-read.csv("C:/Users/Arindam/Documents/Data Science/Cuny/Data 607/Assignments/Country tb cases.csv",header=TRUE, stringsAsFactors = FALSE)

country_tb_yr<-merge(country_population,country_tb_cases,by.country_population="country",by.country_tb_cases="country",by.country_population="year",by.country_tb_cases="year")

country_tb_yr$tb_cases<-(country_tb_yr$sum.total_cases)/(country_tb_yr$population)*100

country_yr_tbcases<-country_tb_yr[ ,c("country","year","tb_cases")]

str(country_yr_tbcases)
## 'data.frame':    1900 obs. of  3 variables:
##  $ country : chr  "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ year    : int  1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 ...
##  $ tb_cases: num  0 0 0.000673 0.009119 0.003727 ...
summary(country_yr_tbcases)
##    country               year         tb_cases      
##  Length:1900        Min.   :1995   Min.   :0.00000  
##  Class :character   1st Qu.:1999   1st Qu.:0.01279  
##  Mode  :character   Median :2004   Median :0.03210  
##                     Mean   :2004   Mean   :0.05416  
##                     3rd Qu.:2009   3rd Qu.:0.06780  
##                     Max.   :2013   Max.   :0.81348
head(country_yr_tbcases)
##       country year     tb_cases
## 1 Afghanistan 1995 0.0000000000
## 2 Afghanistan 1996 0.0000000000
## 3 Afghanistan 1997 0.0006729324
## 4 Afghanistan 1998 0.0091194284
## 5 Afghanistan 1999 0.0037274096
## 6 Afghanistan 2000 0.0129446633
unique(country_yr_tbcases$year)
##  [1] 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008
## [15] 2009 2010 2011 2012 2013
# Find out country with maximun and minimum cases above sero
Country_tb_2013<-subset(country_yr_tbcases,country_yr_tbcases$year==2013 & country_yr_tbcases$tb_cases >0)



Country_tb_2013[which.max(Country_tb_2013$tb_cases),]
##           country year  tb_cases
## 1520 South Africa 2013 0.5918964
Country_tb_2013[which.min(Country_tb_2013$tb_cases),]
##      country year    tb_cases
## 304 Cameroon 2013 0.001307632
plot(country_yr_tbcases$year[country_yr_tbcases$country=="South Africa"],country_yr_tbcases$tb_cases[country_yr_tbcases$country=="South Africa"], xlab = 'South Africa',ylab='percent tb cases')

plot(country_yr_tbcases$year[country_yr_tbcases$country=="Cameroon"],country_yr_tbcases$tb_cases[country_yr_tbcases$country=="Cameroon"], xlab = 'Cameroon',ylab='percent tb cases')