Lab 2-3

Combining data from multiple sources

library(dplyr)
library(sqldf)
library(RODBC)
library(DBI)
library(odbc)

library(downloader)
library(htmlTable)


Connect to the MySQL database

Number of rows from MySQL

mydb = dbConnect(odbc(), "MySQL")
                
mytb <- dbGetQuery(mydb, "select country, year, sex, child, adult,elderly  from tb")
nrow(mytb)
## [1] 3800


Filter data using Query

Group the data based on Country and Year

library(sqldf)

missing <- sqldf( "SELECT country, year, sex, child + adult + elderly as cases FROM mytb WHERE child > -1", row.names=TRUE)

head(missing)
##       country year    sex cases
## 1 Afghanistan 1997 female   102
## 2 Afghanistan 1997   male    26
## 3 Afghanistan 1998 female  1207
## 4 Afghanistan 1998   male   571
## 5 Afghanistan 1999 female   517
## 6 Afghanistan 1999   male   228
missing_groupBy <- sqldf("select country, year,
                         sum(cases) as ncases
                    from missing 
                   group by country, year
                   ")

head(missing_groupBy)
##       country year ncases
## 1 Afghanistan 1997    128
## 2 Afghanistan 1998   1778
## 3 Afghanistan 1999    745
## 4 Afghanistan 2000   2666
## 5 Afghanistan 2001   4639
## 6 Afghanistan 2002   6509


Get data from csv file

Summarize population

summary(population)
##    country               year        population       
##  Length:1900        Min.   :1995   Min.   :6.640e+05  
##  Class :character   1st Qu.:1999   1st Qu.:8.843e+06  
##  Mode  :character   Median :2004   Median :1.960e+07  
##                     Mean   :2004   Mean   :6.084e+07  
##                     3rd Qu.:2009   3rd Qu.:4.756e+07  
##                     Max.   :2013   Max.   :1.386e+09


Join two data sources

final_df <- 
        inner_join(missing_groupBy, population, by=c("country", "year")) %>%   
        mutate(caseRatio = round( ((ncases/population)*100), digits = 4) )                     
htmlTable(head(arrange(final_df, country, year), 20))
country year ncases population caseRatio
1 Afghanistan 1997 128 19021226 7e-04
2 Afghanistan 1998 1778 19496836 0.0091
3 Afghanistan 1999 745 19987071 0.0037
4 Afghanistan 2000 2666 20595360 0.0129
5 Afghanistan 2001 4639 21347782 0.0217
6 Afghanistan 2002 6509 22202806 0.0293
7 Afghanistan 2003 6528 23116142 0.0282
8 Afghanistan 2004 8245 24018682 0.0343
9 Afghanistan 2005 9949 24860855 0.04
10 Afghanistan 2006 12469 25631282 0.0486
11 Afghanistan 2007 13213 26349243 0.0501
12 Afghanistan 2008 13136 27032197 0.0486
13 Afghanistan 2009 12503 27708187 0.0451
14 Afghanistan 2010 12947 28397812 0.0456
15 Afghanistan 2011 15542 29105480 0.0534
16 Afghanistan 2012 15774 29824536 0.0529
17 Afghanistan 2013 3450 30551674 0.0113
18 Algeria 1997 8902 30345466 0.0293
19 Algeria 1999 7088 31276295 0.0227
20 Algeria 2000 8050 31719449 0.0254