Lab 2-3
Combining data from multiple sources
library(dplyr)
library(sqldf)
library(RODBC)
library(DBI)
library(odbc)
library(downloader)
library(htmlTable)
Connect to the MySQL database
Number of rows from MySQL
mydb = dbConnect(odbc(), "MySQL")
mytb <- dbGetQuery(mydb, "select country, year, sex, child, adult,elderly from tb")
nrow(mytb)
## [1] 3800
Filter data using Query
Group the data based on Country and Year
library(sqldf)
missing <- sqldf( "SELECT country, year, sex, child + adult + elderly as cases FROM mytb WHERE child > -1", row.names=TRUE)
head(missing)
## country year sex cases
## 1 Afghanistan 1997 female 102
## 2 Afghanistan 1997 male 26
## 3 Afghanistan 1998 female 1207
## 4 Afghanistan 1998 male 571
## 5 Afghanistan 1999 female 517
## 6 Afghanistan 1999 male 228
missing_groupBy <- sqldf("select country, year,
sum(cases) as ncases
from missing
group by country, year
")
head(missing_groupBy)
## country year ncases
## 1 Afghanistan 1997 128
## 2 Afghanistan 1998 1778
## 3 Afghanistan 1999 745
## 4 Afghanistan 2000 2666
## 5 Afghanistan 2001 4639
## 6 Afghanistan 2002 6509
Get data from csv file
Summarize population
summary(population)
## country year population
## Length:1900 Min. :1995 Min. :6.640e+05
## Class :character 1st Qu.:1999 1st Qu.:8.843e+06
## Mode :character Median :2004 Median :1.960e+07
## Mean :2004 Mean :6.084e+07
## 3rd Qu.:2009 3rd Qu.:4.756e+07
## Max. :2013 Max. :1.386e+09
Join two data sources
final_df <-
inner_join(missing_groupBy, population, by=c("country", "year")) %>%
mutate(caseRatio = round( ((ncases/population)*100), digits = 4) )
htmlTable(head(arrange(final_df, country, year), 20))
|
|
country
|
year
|
ncases
|
population
|
caseRatio
|
|
1
|
Afghanistan
|
1997
|
128
|
19021226
|
7e-04
|
|
2
|
Afghanistan
|
1998
|
1778
|
19496836
|
0.0091
|
|
3
|
Afghanistan
|
1999
|
745
|
19987071
|
0.0037
|
|
4
|
Afghanistan
|
2000
|
2666
|
20595360
|
0.0129
|
|
5
|
Afghanistan
|
2001
|
4639
|
21347782
|
0.0217
|
|
6
|
Afghanistan
|
2002
|
6509
|
22202806
|
0.0293
|
|
7
|
Afghanistan
|
2003
|
6528
|
23116142
|
0.0282
|
|
8
|
Afghanistan
|
2004
|
8245
|
24018682
|
0.0343
|
|
9
|
Afghanistan
|
2005
|
9949
|
24860855
|
0.04
|
|
10
|
Afghanistan
|
2006
|
12469
|
25631282
|
0.0486
|
|
11
|
Afghanistan
|
2007
|
13213
|
26349243
|
0.0501
|
|
12
|
Afghanistan
|
2008
|
13136
|
27032197
|
0.0486
|
|
13
|
Afghanistan
|
2009
|
12503
|
27708187
|
0.0451
|
|
14
|
Afghanistan
|
2010
|
12947
|
28397812
|
0.0456
|
|
15
|
Afghanistan
|
2011
|
15542
|
29105480
|
0.0534
|
|
16
|
Afghanistan
|
2012
|
15774
|
29824536
|
0.0529
|
|
17
|
Afghanistan
|
2013
|
3450
|
30551674
|
0.0113
|
|
18
|
Algeria
|
1997
|
8902
|
30345466
|
0.0293
|
|
19
|
Algeria
|
1999
|
7088
|
31276295
|
0.0227
|
|
20
|
Algeria
|
2000
|
8050
|
31719449
|
0.0254
|