This is the solution to IS607 (Week 3) assignment. Thanks for your time.
#Before we proceed, lets install and load the packages below for easy accesibility to SQL.
#install.packages("RMySQL");
#install.packages("DBI");
library(DBI);
library("RMySQL");
# NOTE: my username = "root", password = "oracle", database_name = "diseases", and host = "localhost". You may be required to change these to your local or server access details.
disease = dbConnect(MySQL(), user='root', password='oracle', dbname='DISEASES', host='localhost');
# viewing the table lists in the database.
dbListTables(disease);
## [1] "population" "tb" "tb_population"
# Query from MySQL through R.
disease2 = dbSendQuery(disease, "SELECT T.SEX, T.CASE1, T.CASE2
FROM TB T
LEFT JOIN TB_POPULATION TP ON TP.COUNTRY = T.COUNTRY
LEFT JOIN POPULATION P ON TP.YEARS = P.YEARS
ORDER BY T.SEX, T.CASE1;");
# Viewing the sql query
disease3 = fetch(disease2, n=-1);
View(disease3);
# Now on R and Loading our databases (tb & population) from my local directory.
library(plyr);
tb1 = read.csv("C:/Data/tb.csv", head = TRUE, sep = ",");
population = read.csv("C:/Data/population.csv", head = TRUE, sep = ",");
# Renaming the tb column names
tb=rename(tb1, c("Afghanistan"="country", "X1995"="year", "female"="sex", "X.1"="case1", "X.1.1"="case2", "X.1.2"="case3"));
# A glance at the output after renaming.
head(tb);
## country year sex case1 case2 case3
## 1 Afghanistan 1995 male -1 -1 -1
## 2 Afghanistan 1996 female -1 -1 -1
## 3 Afghanistan 1996 male -1 -1 -1
## 4 Afghanistan 1997 female 5 96 1
## 5 Afghanistan 1997 male 0 26 0
## 6 Afghanistan 1998 female 45 1142 20
head(population);
## country year population
## 1 Afghanistan 1995 17586073
## 2 Algeria 1995 29315463
## 3 Angola 1995 12104952
## 4 Argentina 1995 34833168
## 5 Azerbaijan 1995 7770806
## 6 Bangladesh 1995 119869585
# Merging the databases together
diseases5 <- merge(tb, population, by =c("country", "year"));
# obtaining the column summation.
sum_all <- cbind(diseases5$case1+diseases5$case2+diseases5$case3);
# cases/population as asked in the assignment!
rate1 <- cbind(sum_all/diseases5$population);
# Combing resulting data with the both tb and population
diseases5["rate"] <- rate1;
head(diseases5);
## country year sex case1 case2 case3 population rate
## 1 Afghanistan 1995 male -1 -1 -1 17586073 -1.705895e-07
## 2 Afghanistan 1996 female -1 -1 -1 18415307 -1.629080e-07
## 3 Afghanistan 1996 male -1 -1 -1 18415307 -1.629080e-07
## 4 Afghanistan 1997 female 5 96 1 19021226 5.362430e-06
## 5 Afghanistan 1997 male 0 26 0 19021226 1.366894e-06
## 6 Afghanistan 1998 female 45 1142 20 19496836 6.190748e-05
# some statistics, Hooray! Here is the results...
display <- diseases5[, c("country","year","rate")];
summary(display$rate);
## V1
## Min. :-4.518e-06
## 1st Qu.: 5.825e-05
## Median : 1.497e-04
## Mean : 2.709e-04
## 3rd Qu.: 3.433e-04
## Max. : 4.098e-03
boxplot(display$rate);
head(display);
## country year rate
## 1 Afghanistan 1995 -1.705895e-07
## 2 Afghanistan 1996 -1.629080e-07
## 3 Afghanistan 1996 -1.629080e-07
## 4 Afghanistan 1997 5.362430e-06
## 5 Afghanistan 1997 1.366894e-06
## 6 Afghanistan 1998 6.190748e-05