ABSTRACT:

AIM AND OBJECTIVE:

Load Libraries:

options(warn = -1)
suppressMessages(library(xlsx))
library(xlsxjars)
library(rJava)
library(devtools)
suppressMessages(library(RMySQL))
library(DBI)
suppressMessages(library(plotly))
suppressWarnings(library(rmongodb))
library(knitr)
library(mongolite)
suppressMessages(library(RCurl))
suppressMessages(library(plyr))
suppressMessages(library(ggplot2))

We will be using the Air Quality dataset from the University of California, Irvine website:

http://archive.ics.uci.edu/ml/datasets/Air+Quality#

https://github.com/mascotinme/MSDA-IS607/blob/master/cckp_historical_data_0.xls

airqu <- read.xlsx2("C:/Users/mayowa/Documents/GitHub/MSDA-IS607/AirQualityUCI.xlsx", sheetName = "AirQualityUCI", header=TRUE, colClasses=NA)

dim(airqu)
## [1] 9471   15
kable(head(airqu))
Date Time CO.GT. PT08.S1.CO. NMHC.GT. C6H6.GT. PT08.S2.NMHC. NOx.GT. PT08.S3.NOx. NO2.GT. PT08.S4.NO2. PT08.S5.O3. T RH AH
2004-03-10 1899-12-30 18:00:00 2.6 1360.00 150 11.881723 1045.50 166 1056.25 113 1692.00 1267.50 13.600 48.875 0.7577538
2004-03-10 1899-12-30 19:00:00 2.0 1292.25 112 9.397165 954.75 103 1173.75 92 1558.75 972.25 13.300 47.700 0.7254874
2004-03-10 1899-12-30 20:00:00 2.2 1402.00 88 8.997817 939.25 131 1140.00 114 1554.50 1074.00 11.900 53.975 0.7502391
2004-03-10 1899-12-30 21:00:00 2.2 1375.50 80 9.228796 948.25 172 1092.00 122 1583.75 1203.25 11.000 60.000 0.7867125
2004-03-10 1899-12-30 22:00:00 1.6 1272.25 51 6.518224 835.50 131 1205.00 116 1490.00 1110.00 11.150 59.575 0.7887942
2004-03-10 1899-12-30 23:00:00 1.2 1197.00 38 4.741012 750.25 89 1336.50 96 1393.00 949.25 11.175 59.175 0.7847717
kable(tail(airqu))
Date Time CO.GT. PT08.S1.CO. NMHC.GT. C6H6.GT. PT08.S2.NMHC. NOx.GT. PT08.S3.NOx. NO2.GT. PT08.S4.NO2. PT08.S5.O3. T RH AH
9466 NA NA NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
9467 NA NA NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
9468 NA NA NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
9469 NA NA NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
9470 NA NA NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
9471 NA NA NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
names(airqu)
##  [1] "Date"          "Time"          "CO.GT."        "PT08.S1.CO."  
##  [5] "NMHC.GT."      "C6H6.GT."      "PT08.S2.NMHC." "NOx.GT."      
##  [9] "PT08.S3.NOx."  "NO2.GT."       "PT08.S4.NO2."  "PT08.S5.O3."  
## [13] "T"             "RH"            "AH"
str(airqu)
## 'data.frame':    9471 obs. of  15 variables:
##  $ Date         :Class 'POSIXct'  atomic [1:9471] 1.08e+09 1.08e+09 1.08e+09 1.08e+09 1.08e+09 ...
##   .. ..- attr(*, "tzone")= chr "GMT"
##  $ Time         :Class 'POSIXct'  atomic [1:9471] -2.21e+09 -2.21e+09 -2.21e+09 -2.21e+09 -2.21e+09 ...
##   .. ..- attr(*, "tzone")= chr "GMT"
##  $ CO.GT.       : num  2.6 2 2.2 2.2 1.6 1.2 1.2 1 0.9 0.6 ...
##  $ PT08.S1.CO.  : num  1360 1292 1402 1376 1272 ...
##  $ NMHC.GT.     : num  150 112 88 80 51 38 31 31 24 19 ...
##  $ C6H6.GT.     : num  11.88 9.4 9 9.23 6.52 ...
##  $ PT08.S2.NMHC.: num  1046 955 939 948 836 ...
##  $ NOx.GT.      : num  166 103 131 172 131 89 62 62 45 -200 ...
##  $ PT08.S3.NOx. : num  1056 1174 1140 1092 1205 ...
##  $ NO2.GT.      : num  113 92 114 122 116 96 77 76 60 -200 ...
##  $ PT08.S4.NO2. : num  1692 1559 1554 1584 1490 ...
##  $ PT08.S5.O3.  : num  1268 972 1074 1203 1110 ...
##  $ T            : num  13.6 13.3 11.9 11 11.2 ...
##  $ RH           : num  48.9 47.7 54 60 59.6 ...
##  $ AH           : num  0.758 0.725 0.75 0.787 0.789 ...
class(airqu)
## [1] "data.frame"
airqu$NMHC.GT.[is.na(airqu$NMHC.GT.)] <- 0.0000
airqu$CO.GT.[is.na(airqu$CO.GT.)] <- 0.0000
airqu$PT08.S1.CO.[is.na(airqu$PT08.S1.CO.)] <- 0.0000
airqu$C6H6.GT.[is.na(airqu$C6H6.GT.)] <- 0.0000
airqu$PT08.S2.NMHC.[is.na(airqu$PT08.S2.NMHC.)] <- 0.0000
airqu$NOx.GT.[is.na(airqu$NOx.GT.)] <- 0.0000
airqu$PT08.S3.NOx.[is.na(airqu$PT08.S3.NOx.)] <- 0.0000
airqu$NO2.GT.[is.na(airqu$NO2.GT.)] <- 0.0000
airqu$PT08.S4.NO2.[is.na(airqu$PT08.S4.NO2.)] <- 0.0000
airqu$PT08.S5.O3.[is.na(airqu$PT08.S5.O3.)] <- 0.0000
airqu$T[is.na(airqu$T)] <- 0.0000
airqu$RH[is.na(airqu$RH)] <- 0.0000
airqu$AH[is.na(airqu$AH)] <- 0.0000

kable(head(airqu))
Date Time CO.GT. PT08.S1.CO. NMHC.GT. C6H6.GT. PT08.S2.NMHC. NOx.GT. PT08.S3.NOx. NO2.GT. PT08.S4.NO2. PT08.S5.O3. T RH AH
2004-03-10 1899-12-30 18:00:00 2.6 1360.00 150 11.881723 1045.50 166 1056.25 113 1692.00 1267.50 13.600 48.875 0.7577538
2004-03-10 1899-12-30 19:00:00 2.0 1292.25 112 9.397165 954.75 103 1173.75 92 1558.75 972.25 13.300 47.700 0.7254874
2004-03-10 1899-12-30 20:00:00 2.2 1402.00 88 8.997817 939.25 131 1140.00 114 1554.50 1074.00 11.900 53.975 0.7502391
2004-03-10 1899-12-30 21:00:00 2.2 1375.50 80 9.228796 948.25 172 1092.00 122 1583.75 1203.25 11.000 60.000 0.7867125
2004-03-10 1899-12-30 22:00:00 1.6 1272.25 51 6.518224 835.50 131 1205.00 116 1490.00 1110.00 11.150 59.575 0.7887942
2004-03-10 1899-12-30 23:00:00 1.2 1197.00 38 4.741012 750.25 89 1336.50 96 1393.00 949.25 11.175 59.175 0.7847717
colnames(airqu)[1] <- "date"
colnames(airqu)[2] <- "time"
colnames(airqu)[3] <- "co"
colnames(airqu)[4] <- "tin_oxide"
colnames(airqu)[5] <- "hydrocarbon"
colnames(airqu)[6] <- "benzene"
colnames(airqu)[7] <- "titania"
colnames(airqu)[8] <- "nitroge_oxide"
colnames(airqu)[9] <- "tungsten-oxide"
colnames(airqu)[10] <- "no2"
colnames(airqu)[11] <- "tungsten_oxide_o2"
colnames(airqu)[12] <- "indium_oxide"
colnames(airqu)[13] <- "temp_ac"
colnames(airqu)[14] <- "rh"
colnames(airqu)[15] <- "ah"


kable(head(airqu))
date time co tin_oxide hydrocarbon benzene titania nitroge_oxide tungsten-oxide no2 tungsten_oxide_o2 indium_oxide temp_ac rh ah
2004-03-10 1899-12-30 18:00:00 2.6 1360.00 150 11.881723 1045.50 166 1056.25 113 1692.00 1267.50 13.600 48.875 0.7577538
2004-03-10 1899-12-30 19:00:00 2.0 1292.25 112 9.397165 954.75 103 1173.75 92 1558.75 972.25 13.300 47.700 0.7254874
2004-03-10 1899-12-30 20:00:00 2.2 1402.00 88 8.997817 939.25 131 1140.00 114 1554.50 1074.00 11.900 53.975 0.7502391
2004-03-10 1899-12-30 21:00:00 2.2 1375.50 80 9.228796 948.25 172 1092.00 122 1583.75 1203.25 11.000 60.000 0.7867125
2004-03-10 1899-12-30 22:00:00 1.6 1272.25 51 6.518224 835.50 131 1205.00 116 1490.00 1110.00 11.150 59.575 0.7887942
2004-03-10 1899-12-30 23:00:00 1.2 1197.00 38 4.741012 750.25 89 1336.50 96 1393.00 949.25 11.175 59.175 0.7847717

We will now connect R with MySql database.

mysql = dbConnect(MySQL(), user='root', password='oracle', dbname='world', host='localhost') # This connects the R with MySQL. Note that the login entities may be different from your login details, kindly adjust as neccesary.

We would like to view the list of database(s) which are already in MySQL database.

class(mysql)
## [1] "MySQLConnection"
## attr(,"package")
## [1] "RMySQL"
dbListTables(mysql)
## [1] "airqu"           "airqualityuci"   "city"            "country"        
## [5] "countrylanguage" "flights"

Dropping and adding of table if they already exist to give access to the new table to be created.

dbSendQuery(mysql, 'drop table if exists airqu')
## <MySQLResult:2,0,1>
dbWriteTable(conn=mysql, name='airqu', value=airqu)
## [1] TRUE
dbListFields(mysql, 'airqualityuci')
##  [1] "row_names"         "Date"              "Time"             
##  [4] "co"                "tin_oxide"         "hydrocarbon"      
##  [7] "benzene"           "titania"           "nitroge_oxide"    
## [10] "tungsten.oxide"    "no2"               "tungsten_oxide_o2"
## [13] "indium_oxide"      "temp_ac"           "rh"               
## [16] "ah"

We can now read and access the newly created table in R from MySQL directly.

airqualityuci <- dbReadTable(conn=mysql, name='airqu')

write.table(airqualityuci, file = "airqualityuci.csv",row.names=FALSE, na="",col.names=TRUE, sep=",")

Attribute Information :

airqualityuci <- airqualityuci[sample(nrow(airqualityuci), 900), ]
dim(airqualityuci)
## [1] 900  15
str(airqualityuci)
## 'data.frame':    900 obs. of  15 variables:
##  $ date             : chr  "1102377600" "1104624000" "1103068800" "1109116800" ...
##  $ time             : chr  "-2209147200" "-2209129200" "-2209161600" "-2209158000" ...
##  $ co               : num  -200 1.3 2.9 1.1 2.1 2.5 0.5 1.2 2 2.3 ...
##  $ tin_oxide        : num  828 960 -200 952 1327 ...
##  $ hydrocarbon      : num  -200 -200 -200 -200 256 -200 -200 -200 245 -200 ...
##  $ benzene          : num  2.61 3.87 -200 3.21 9.82 ...
##  $ titania          : num  627 704 -200 665 971 ...
##  $ nitroge_oxide    : num  146 226 510 146 124 ...
##  $ tungsten.oxide   : num  1074 887 -200 907 803 ...
##  $ no2              : num  64 106 115 119 89 ...
##  $ tungsten_oxide_o2: num  1102 1002 -200 1007 1705 ...
##  $ indium_oxide     : num  748 947 -200 703 1120 ...
##  $ temp_ac          : num  11.9 3.3 -200 4 21.6 ...
##  $ rh               : num  71.4 68.2 -200 79.3 41.7 ...
##  $ ah               : num  0.992 0.535 -200 0.652 1.061 ...
summary(airqualityuci)
##      date               time                 co             tin_oxide     
##  Length:900         Length:900         Min.   :-200.000   Min.   :-200.0  
##  Class :character   Class :character   1st Qu.:   0.575   1st Qu.: 913.2  
##  Mode  :character   Mode  :character   Median :   1.500   Median :1048.4  
##                                        Mean   : -33.994   Mean   :1039.5  
##                                        3rd Qu.:   2.600   3rd Qu.:1217.9  
##                                        Max.   :  11.900   Max.   :2007.8  
##   hydrocarbon        benzene            titania       nitroge_oxide    
##  Min.   :-200.0   Min.   :-200.000   Min.   :-200.0   Min.   :-200.00  
##  1st Qu.:-200.0   1st Qu.:   3.824   1st Qu.: 700.9   1st Qu.:  48.92  
##  Median :-200.0   Median :   7.647   Median : 884.5   Median : 131.50  
##  Mean   :-150.1   Mean   :   1.954   Mean   : 880.3   Mean   : 163.60  
##  3rd Qu.:-200.0   3rd Qu.:  13.967   3rd Qu.:1115.6   3rd Qu.: 279.00  
##  Max.   : 926.0   Max.   :  50.633   Max.   :1980.2   Max.   :1389.00  
##  tungsten.oxide        no2          tungsten_oxide_o2  indium_oxide   
##  Min.   :-200.0   Min.   :-200.00   Min.   :-200      Min.   :-200.0  
##  1st Qu.: 629.9   1st Qu.:  50.75   1st Qu.:1189      1st Qu.: 689.3  
##  Median : 804.2   Median :  91.75   Median :1459      Median : 937.2  
##  Mean   : 796.7   Mean   :  54.87   Mean   :1387      Mean   : 963.6  
##  3rd Qu.: 975.2   3rd Qu.: 127.85   3rd Qu.:1681      3rd Qu.:1245.8  
##  Max.   :2559.2   Max.   : 312.40   Max.   :2643      Max.   :2515.2  
##     temp_ac               rh                ah           
##  Min.   :-200.000   Min.   :-200.00   Min.   :-200.0000  
##  1st Qu.:  11.325   1st Qu.:  34.26   1st Qu.:   0.7046  
##  Median :  17.900   Median :  48.95   Median :   0.9803  
##  Mean   :   9.923   Mean   :  39.58   Mean   :  -6.5654  
##  3rd Qu.:  23.800   3rd Qu.:  61.29   3rd Qu.:   1.3131  
##  Max.   :  41.600   Max.   :  85.70   Max.   :   2.1766

What recommendations has the federal government made to protect human health?

….EPA estimates that 10 ppb benzene in drinking water that is consumed regularly or exposure to 0.4 ppb in air over a lifetime could cause a risk of one additional cancer case for every 100,000 exposed persons……

mysqlquery <- dbGetQuery(mysql, "SELECT Date, Time, benzene 
     FROM airqu 
     WHERE benzene IN (SELECT benzene 
                  FROM airqu 
                  WHERE benzene > 10)")

kable(head(mysqlquery))
Date Time benzene
1078876800 -2209096800 11.88172
1078963200 -2209111200 11.53901
1078963200 -2209100400 11.15158
1078963200 -2209096800 20.79922
1078963200 -2209093200 27.35981
1078963200 -2209089600 24.01776

MONGODB —A NO-SQL

mongo_conn <- mongo.create() # createing a MongoDB connection

mongo.is.connected(mongo_conn)  # checking if the mongo is connected
## [1] TRUE
if(mongo.is.connected(mongo_conn) == TRUE) {
  mongo.get.databases(mongo_conn)
}
## [1] "database" "flights"  "hockey"   "world"
if(mongo.is.connected(mongo_conn) == TRUE) {
  db <- "world"
  mongo.get.database.collections(mongo_conn, db)
}
## character(0)
mongo_conn = mongo(collection = "airqualityuci",  db = "world")
mongo_conn$insert(airqualityuci) # Insert the airqualityuci into mongodb collection
## 
Complete! Processed total of 900 rows.
## [1] TRUE
plot(airqualityuci$temp_ac, airqualityuci$benzene, main = "Scatter Plot", xlab = "Temperature", ylab = "Benzene")
abline(lm(airqualityuci$temp_ac~airqualityuci$benzene), col = "blue");

cor(airqualityuci$temp_ac, airqualityuci$benzene)
## [1] 0.9719243
temp_co <- lm(temp_ac ~ benzene, data=airqualityuci)

summary.lm(temp_co)
## 
## Call:
## lm(formula = temp_ac ~ benzene, data = airqualityuci)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -46.799  -5.388   0.069   6.306  27.363 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 7.943506   0.333563   23.81   <2e-16 ***
## benzene     1.012787   0.008182  123.78   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.995 on 898 degrees of freedom
## Multiple R-squared:  0.9446, Adjusted R-squared:  0.9446 
## F-statistic: 1.532e+04 on 1 and 898 DF,  p-value: < 2.2e-16
plot(x=temp_co$residuals, y=temp_co$benzene)
abline(h = 0, lty = 3, col = "blue")

hist(temp_co$residuals)

airqual <- subset(airqualityuci, select=c(co, tin_oxide, hydrocarbon, benzene, titania, nitroge_oxide, tungsten.oxide, no2, tungsten_oxide_o2, indium_oxide, temp_ac, rh, ah  ))

plot(airqual)

We are going to do a Multiple Linear Regression model. But, we will want to chose the best model for our prediction.

\({ Y }\quad =\quad { B }_{ 0 }\quad +\quad { B }_{ 1 }{ X }_{ 1 }\quad +\quad { B }_{ 2 }{ X }_{ 2 }\quad +\quad\) ………+\(\quad { B }_{ n }{ X }_{ n }\quad\) +\(\quad { e}\\\)

SETTING UP HYPOTHESIS: # 1

\(H_a:\) Not \(H_o\) : \(\mu_1\) \(\neq\) \(\mu_2\) \(\neq\) \(\mu_3\)…..\(\neq\) \(\mu_n\)

Rejection:

Reject \(H_o\) (Null Hypothesis) if the calculated value (P-Value) is less that the tabulated value(Table value = 0.05 ), otherwise do not reject \(H_o\)

modelselection <- lm(temp_ac ~ tin_oxide+hydrocarbon+benzene+titania+nitroge_oxide+tungsten.oxide+no2+tungsten_oxide_o2+indium_oxide+co+rh+ah, data=airqual)

stepwise <- step(modelselection, direction = "both") # Model Selection using both FORWARD AND BACKWARD selection.
## Start:  AIC=1998.93
## temp_ac ~ tin_oxide + hydrocarbon + benzene + titania + nitroge_oxide + 
##     tungsten.oxide + no2 + tungsten_oxide_o2 + indium_oxide + 
##     co + rh + ah
## 
##                     Df Sum of Sq     RSS    AIC
## - co                 1      12.0  8071.0 1998.3
## <none>                            8059.0 1998.9
## - tin_oxide          1      41.4  8100.3 2001.5
## - titania            1     180.2  8239.1 2016.8
## - indium_oxide       1     316.6  8375.6 2031.6
## - tungsten.oxide     1     406.2  8465.1 2041.2
## - no2                1     464.0  8523.0 2047.3
## - nitroge_oxide      1     562.3  8621.3 2057.6
## - hydrocarbon        1    1102.7  9161.6 2112.3
## - benzene            1    3437.7 11496.7 2316.7
## - tungsten_oxide_o2  1   14688.1 22747.1 2930.8
## - rh                 1   15311.8 23370.8 2955.2
## - ah                 1   17500.6 25559.6 3035.7
## 
## Step:  AIC=1998.27
## temp_ac ~ tin_oxide + hydrocarbon + benzene + titania + nitroge_oxide + 
##     tungsten.oxide + no2 + tungsten_oxide_o2 + indium_oxide + 
##     rh + ah
## 
##                     Df Sum of Sq     RSS    AIC
## <none>                            8071.0 1998.3
## + co                 1      12.0  8059.0 1998.9
## - tin_oxide          1      44.1  8115.1 2001.2
## - titania            1     179.1  8250.1 2016.0
## - indium_oxide       1     311.5  8382.5 2030.3
## - tungsten.oxide     1     405.3  8476.3 2040.4
## - nitroge_oxide      1     553.7  8624.7 2056.0
## - no2                1     605.1  8676.1 2061.3
## - hydrocarbon        1    1153.0  9224.0 2116.4
## - benzene            1    3432.7 11503.7 2315.2
## - tungsten_oxide_o2  1   14676.7 22747.7 2928.8
## - rh                 1   15330.6 23401.6 2954.3
## - ah                 1   17489.2 25560.2 3033.8
summary(stepwise)
## 
## Call:
## lm(formula = temp_ac ~ tin_oxide + hydrocarbon + benzene + titania + 
##     nitroge_oxide + tungsten.oxide + no2 + tungsten_oxide_o2 + 
##     indium_oxide + rh + ah, data = airqual)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -11.545  -1.923   0.072   1.880  11.348 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        0.8053615  0.8533980   0.944   0.3456    
## tin_oxide          0.0028808  0.0013080   2.202   0.0279 *  
## hydrocarbon       -0.0087971  0.0007811 -11.263  < 2e-16 ***
## benzene           -1.0456326  0.0538047 -19.434  < 2e-16 ***
## titania            0.0090238  0.0020330   4.439 1.02e-05 ***
## nitroge_oxide      0.0079542  0.0010191   7.805 1.67e-14 ***
## tungsten.oxide    -0.0033219  0.0004974  -6.678 4.27e-11 ***
## no2               -0.0136231  0.0016696  -8.160 1.14e-15 ***
## tungsten_oxide_o2  0.0237220  0.0005903  40.184  < 2e-16 ***
## indium_oxide      -0.0042364  0.0007237  -5.854 6.75e-09 ***
## rh                -0.3130253  0.0076218 -41.070  < 2e-16 ***
## ah                 2.3493839  0.0535582  43.866  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.015 on 888 degrees of freedom
## Multiple R-squared:  0.995,  Adjusted R-squared:  0.995 
## F-statistic: 1.613e+04 on 11 and 888 DF,  p-value: < 2.2e-16

ANALYSIS OF VARIANCE (ANOVA)

anova(stepwise, test= "F")
## Analysis of Variance Table
## 
## Response: temp_ac
##                    Df Sum Sq Mean Sq  F value    Pr(>F)    
## tin_oxide           1 826842  826842 90972.22 < 2.2e-16 ***
## hydrocarbon         1  15604   15604  1716.86 < 2.2e-16 ***
## benzene             1 706894  706894 77775.04 < 2.2e-16 ***
## titania             1   1203    1203   132.38 < 2.2e-16 ***
## nitroge_oxide       1  15046   15046  1655.40 < 2.2e-16 ***
## tungsten.oxide      1   2218    2218   244.00 < 2.2e-16 ***
## no2                 1   6032    6032   663.65 < 2.2e-16 ***
## tungsten_oxide_o2   1  11437   11437  1258.34 < 2.2e-16 ***
## indium_oxide        1   2209    2209   243.04 < 2.2e-16 ***
## rh                  1   7470    7470   821.89 < 2.2e-16 ***
## ah                  1  17489   17489  1924.23 < 2.2e-16 ***
## Residuals         888   8071       9                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
confint(stepwise)
##                           2.5 %       97.5 %
## (Intercept)       -0.8695506430  2.480273732
## tin_oxide          0.0003135715  0.005447929
## hydrocarbon       -0.0103300920 -0.007264198
## benzene           -1.1512318317 -0.940033361
## titania            0.0050338134  0.013013856
## nitroge_oxide      0.0059541156  0.009954319
## tungsten.oxide    -0.0042981694 -0.002345581
## no2               -0.0168999194 -0.010346287
## tungsten_oxide_o2  0.0225634311  0.024880636
## indium_oxide      -0.0056567585 -0.002816125
## rh                -0.3279841095 -0.298066458
## ah                 2.2442684699  2.454499276

Decision:

From the above multiple regression analysis and ANOVA, we found that the P-value is actually low compare to the Table value (0.05), we therefor reject \(H_o\) (Null hypothesis) and conclude that there is/are difference(s) between the variables.

The Error Term or Residual

residual_temp = data.frame(Fitted = fitted(stepwise),
Residuals = resid(stepwise), Treatment = airqual$temp_ac)

residual_benzene = data.frame(Fitted = fitted(stepwise),
Residuals = resid(stepwise), Treatment = airqual$benzene)



ggplot(residual_temp, aes(Fitted, Residuals, colour = Treatment)) + geom_point()

ggplot(residual_benzene, aes(Fitted, Residuals, colour = Treatment)) + geom_point()

SECOND DATASET WAS EXTRACTED FROM THE WORLD BANK WEBSITE AS A .XLS FILE

suppressMessages(library(dplyr))

dataset2 <- read.xlsx("C:/Users/mayowa/Documents/cckp_historical_data_0.xls", sheetIndex=4,
  startRow=NULL, endRow=179, colIndex=NULL,
  as.data.frame=TRUE, header=TRUE,
  keepFormulas=FALSE)

str(dataset2)
## 'data.frame':    178 obs. of  14 variables:
##  $ ISO_3DIGIT : Factor w/ 178 levels "AFG","AGO","ALB",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Jan_Temp   : num  0.0731 22.5822 2.0231 18.4275 20.8035 ...
##  $ Feb_temp   : num  2.11 22.68 3.22 19.43 19.9 ...
##  $ Mar_temp   : num  7.6 22.78 6.04 22.61 17.51 ...
##  $ Apr_Temp   : num  13.37 22.35 9.92 26.58 14.05 ...
##  $ May_temp   : num  18.2 20.7 14.4 30.6 10.6 ...
##  $ Jun_Temp   : num  23.2 18.37 17.93 32.46 7.66 ...
##  $ July_Temp  : num  25.26 17.95 20.54 33.8 7.42 ...
##  $ Aug_Temp   : num  23.77 19.9 20.48 33.55 9.02 ...
##  $ Sept_temp  : num  19 22.2 17.2 31.7 11.5 ...
##  $ Oct_temp   : num  13 23.2 12.3 28.3 14.7 ...
##  $ Nov_Temp   : num  7 22.79 7.58 24.06 17.54 ...
##  $ Dec_temp   : num  2.43 22.61 3.65 20.28 19.83 ...
##  $ Annual_temp: num  12.9 21.5 11.3 26.8 14.2 ...
kable(summary(dataset2))
ISO_3DIGIT Jan_Temp Feb_temp Mar_temp Apr_Temp May_temp Jun_Temp July_Temp Aug_Temp Sept_temp Oct_temp Nov_Temp Dec_temp Annual_temp
AFG : 1 Min. :-26.9967 Min. :-24.725 Min. :-18.717 Min. :-14.249 Min. :-6.319 Min. : 0.07677 Min. : 2.02 Min. : 2.583 Min. :-2.336 Min. :-8.36 Min. :-17.638 Min. :-24.074 Min. :-9.143
AGO : 1 1st Qu.: 0.1473 1st Qu.: 2.092 1st Qu.: 5.265 1st Qu.: 9.689 1st Qu.:14.050 1st Qu.:16.36776 1st Qu.:18.05 1st Qu.:18.455 1st Qu.:15.333 1st Qu.:10.85 1st Qu.: 5.953 1st Qu.: 2.041 1st Qu.:10.091
ALB : 1 Median : 19.2529 Median : 20.492 Median : 21.995 Median : 22.028 Median :21.907 Median :22.99741 Median :22.98 Median :23.314 Median :22.598 Median :22.49 Median : 21.282 Median : 20.043 Median :21.582
ARE : 1 Mean : 12.8916 Mean : 13.879 Mean : 15.935 Mean : 18.241 Mean :20.160 Mean :21.34465 Mean :22.01 Mean :21.941 Mean :20.757 Mean :18.70 Mean : 15.992 Mean : 13.729 Mean :17.965
ARG : 1 3rd Qu.: 24.2336 3rd Qu.: 24.821 3rd Qu.: 25.333 3rd Qu.: 25.633 3rd Qu.:25.898 3rd Qu.:26.01320 3rd Qu.:26.05 3rd Qu.:26.057 3rd Qu.:25.772 3rd Qu.:25.32 3rd Qu.: 24.545 3rd Qu.: 23.983 3rd Qu.:25.046
ARM : 1 Max. : 27.7845 Max. : 29.088 Max. : 30.629 Max. : 32.055 Max. :33.062 Max. :34.72038 Max. :36.25 Max. :35.878 Max. :32.778 Max. :29.55 Max. : 27.473 Max. : 27.302 Max. :28.300
(Other):172 NA NA NA NA NA NA NA NA NA NA NA NA NA

we are going to sample 172 observations from the 1st dataset to match with the second dataset to avoid bias.

SETTING UP HYPOTHESIS: # 2

Rejection:

Reject \(H_o\) (Null Hypothesis) if the calculated value (P-Value) is less that the tabulated value(Table value = 0.05 ), otherwise do not reject \(H_o\)

Decision:

We therefore Accept \(H_o\), since calculated calculated value (P-Value) is greater than the tabulated value(Table value = 0.05 )

sample1 <- airqual[sample(nrow(airqual), 178), ]
dim(sample1)
## [1] 178  13
temp_2 <- lm(sample1$temp_ac ~ dataset2$Annual_temp)

summary.lm(temp_2)
## 
## Call:
## lm(formula = sample1$temp_ac ~ dataset2$Annual_temp)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -214.339    1.089    7.824   12.811   33.684 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)   
## (Intercept)           19.3854     7.0031   2.768  0.00624 **
## dataset2$Annual_temp  -0.4388     0.3516  -1.248  0.21363   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 40.36 on 176 degrees of freedom
## Multiple R-squared:  0.008774,   Adjusted R-squared:  0.003142 
## F-statistic: 1.558 on 1 and 176 DF,  p-value: 0.2136
cor.test(sample1$temp_ac, dataset2$Annual_temp, method = c("pearson", "kendall", "spearman"),
         exact = NULL, conf.level = 0.95, continuity = FALSE)
## 
##  Pearson's product-moment correlation
## 
## data:  sample1$temp_ac and dataset2$Annual_temp
## t = -1.2482, df = 176, p-value = 0.2136
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.23748263  0.05416107
## sample estimates:
##         cor 
## -0.09366983
anova(temp_2)
## Analysis of Variance Table
## 
## Response: sample1$temp_ac
##                       Df Sum Sq Mean Sq F value Pr(>F)
## dataset2$Annual_temp   1   2537  2537.2  1.5579 0.2136
## Residuals            176 286636  1628.6

DATASET THREE(3)

SETTING UP HYPOTHESIS: # 3

Rejection:

Reject \(H_o\) (Null Hypothesis) if the calculated value (P-Value) is less than the tabulated value(Table value = 0.05 ), otherwise do not reject \(H_o\)

library(plyr)

incd <- read.csv("https://raw.githubusercontent.com/mascotinme/MSDA-IS607/master/incd.csv", sep = ",", header = TRUE, skip = 8)
 
str(incd)
## 'data.frame':    75 obs. of  10 variables:
##  $ State                                             : Factor w/ 71 levels "","# Data do not include cases diagnosed in other states for those states in which the data exchange agreement specifically prohib"| __truncated__,..: 64 70 41 37 29 33 50 52 44 34 ...
##  $ FIPS                                              : int  0 55000 27000 23000 16000 19000 36000 38000 30000 20000 ...
##  $ Age.Adjusted.Incidence.Rate......cases.per.100.000: Factor w/ 35 levels "","¶¶ ","10.2",..: 17 35 34 33 32 31 30 29 28 27 ...
##  $ Lower.95..Confidence.Interval                     : Factor w/ 35 levels "","¶¶","10","10.4",..: 22 34 33 32 30 31 32 26 27 29 ...
##  $ Upper.95..Confidence.Interval                     : Factor w/ 36 levels ""," ¶¶","11",..: 15 36 34 35 32 30 28 33 31 29 ...
##  $ Average.Annual.Count                              : Factor w/ 53 levels "","¶¶","1005",..: 30 4 52 20 19 35 27 6 14 31 ...
##  $ Recent.Trend                                      : Factor w/ 5 levels "","¶¶","falling",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ Recent.5.Year.Trend.....in.Incidence.Rates        : Factor w/ 38 levels "","-0.1","-0.3",..: 15 2 32 17 6 15 26 12 33 28 ...
##  $ Lower.95..Confidence.Interval.1                   : Factor w/ 43 levels "","-0.1","-0.3",..: 7 22 16 34 10 3 20 35 19 2 ...
##  $ Upper.95..Confidence.Interval.1                   : Factor w/ 43 levels "","-0.1","¶¶",..: 7 25 43 40 38 5 37 8 12 29 ...
# Rename the column names for easy accesibility

incd$Average.Annual.Count <- as.numeric(incd$Average.Annual.Count)

names(incd) <- c("state", "FIPS", "adj_age_100000", "lower_Adj_CI", "upper_Adj_CI", "avg_annual_count", "Recent.Trend", "5years_rate", "Lower_5Yrs_CI", "upper_5Yrs_CI")



# We are going to seperate the rows(Stable, Rising and Falling)


str(incd)
## 'data.frame':    75 obs. of  10 variables:
##  $ state           : Factor w/ 71 levels "","# Data do not include cases diagnosed in other states for those states in which the data exchange agreement specifically prohib"| __truncated__,..: 64 70 41 37 29 33 50 52 44 34 ...
##  $ FIPS            : int  0 55000 27000 23000 16000 19000 36000 38000 30000 20000 ...
##  $ adj_age_100000  : Factor w/ 35 levels "","¶¶ ","10.2",..: 17 35 34 33 32 31 30 29 28 27 ...
##  $ lower_Adj_CI    : Factor w/ 35 levels "","¶¶","10","10.4",..: 22 34 33 32 30 31 32 26 27 29 ...
##  $ upper_Adj_CI    : Factor w/ 36 levels ""," ¶¶","11",..: 15 36 34 35 32 30 28 33 31 29 ...
##  $ avg_annual_count: num  30 4 52 20 19 35 27 6 14 31 ...
##  $ Recent.Trend    : Factor w/ 5 levels "","¶¶","falling",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ 5years_rate     : Factor w/ 38 levels "","-0.1","-0.3",..: 15 2 32 17 6 15 26 12 33 28 ...
##  $ Lower_5Yrs_CI   : Factor w/ 43 levels "","-0.1","-0.3",..: 7 22 16 34 10 3 20 35 19 2 ...
##  $ upper_5Yrs_CI   : Factor w/ 43 levels "","-0.1","¶¶",..: 7 25 43 40 38 5 37 8 12 29 ...
kable(head(incd))
state FIPS adj_age_100000 lower_Adj_CI upper_Adj_CI avg_annual_count Recent.Trend 5years_rate Lower_5Yrs_CI upper_5Yrs_CI
US (SEER+NPCR)(1,10) 0 13.2 13.1 13.2 30 stable 0 -1.3 1.2
Wisconsin(6,10) 55000 16.5 16.1 17 4 stable -0.1 -3.6 3.6
Minnesota(6,10) 27000 16 15.5 16.4 52 stable 3.4 -2.7 9.8
Maine(6,10) 23000 15.7 14.8 16.6 20 stable 0.3 -8 9.2
Idaho(6,10) 16000 15.3 14.4 16.2 19 stable -1.7 -10.4 7.7
Iowa(3,8) 19000 15.2 14.6 15.8 35 stable 0 -0.3 0.3
stable <- subset(incd, incd$Recent.Trend == "stable")
rising <- subset(incd, incd$Recent.Trend == "rising")
falling <- subset(incd, incd$Recent.Trend == "falling")

kable(head(stable))
state FIPS adj_age_100000 lower_Adj_CI upper_Adj_CI avg_annual_count Recent.Trend 5years_rate Lower_5Yrs_CI upper_5Yrs_CI
US (SEER+NPCR)(1,10) 0 13.2 13.1 13.2 30 stable 0 -1.3 1.2
Wisconsin(6,10) 55000 16.5 16.1 17 4 stable -0.1 -3.6 3.6
Minnesota(6,10) 27000 16 15.5 16.4 52 stable 3.4 -2.7 9.8
Maine(6,10) 23000 15.7 14.8 16.6 20 stable 0.3 -8 9.2
Idaho(6,10) 16000 15.3 14.4 16.2 19 stable -1.7 -10.4 7.7
Iowa(3,8) 19000 15.2 14.6 15.8 35 stable 0 -0.3 0.3
kable(head(rising))
state FIPS adj_age_100000 lower_Adj_CI upper_Adj_CI avg_annual_count Recent.Trend 5years_rate Lower_5Yrs_CI upper_5Yrs_CI
12 Connecticut(3,8) 9000 14.7 14.2 15.2 38 rising 0.8 0.4 1.2
14 New Jersey(3,8) 34000 14.2 13.9 14.6 9 rising 0.4 0.1 0.6
24 Utah(3,8) 49000 13.3 12.6 14 26 rising 0.8 0.3 1.4
29 Tennessee(6,10) 47000 13.1 12.7 13.5 50 rising 2.6 0.1 5.2
34 Missouri(6,10) 29000 12.7 12.3 13.1 47 rising 2.4 0.3 4.6
43 Hawaii(3,8) 15000 12 11.2 12.8 16 rising 1.6 0.6 2.5
kable(head(falling))
state FIPS adj_age_100000 lower_Adj_CI upper_Adj_CI avg_annual_count Recent.Trend 5years_rate Lower_5Yrs_CI upper_5Yrs_CI
37 California(3,9) 6000 12.5 12.4 12.7 32 falling -1.7 -3.4 -0.1
require(ggplot2)

ggplot(stable, col = "red", aes(x =state, y=round(avg_annual_count,2))) +
       geom_bar(stat="identity",position="dodge") + 
       xlab("States") + 
       ylab("Average Annual counts") + 
       ggtitle("Average Annual Stable Leukemia Counts per US States")

ggplot(rising, col = "blue", aes(x =state, y=round(avg_annual_count,2))) +
       geom_bar(stat="identity",position="dodge") + 
       xlab("States") + 
       ylab("Average Annual counts") + 
       ggtitle("Average Annual Rising Leukemia Counts per US States")

sample_all1 <- sample1[sample(nrow(sample1), 45), ]
sample_all2 <- dataset2[sample(nrow(dataset2), 45), ]
sample_all3 <- incd[sample(nrow(incd), 45), ]
allsamples <- data.frame(c(sample_all1, sample_all2, sample_all3))


all_dataset_subset <- allsamples[, c("avg_annual_count", "Annual_temp" , "co", "benzene")]
str(all_dataset_subset)
## 'data.frame':    45 obs. of  4 variables:
##  $ avg_annual_count: num  52 41 1 8 1 21 1 6 2 1 ...
##  $ Annual_temp     : num  23.4 24.8 10.9 21.9 25.7 ...
##  $ co              : num  0.7 1.4 1 0.9 1.5 1 0.7 3.2 0.5 -200 ...
##  $ benzene         : num  4.03 8.08 6.67 4.43 7.26 ...
linearmodel <- lm(all_dataset_subset$avg_annual_count ~(all_dataset_subset$co + all_dataset_subset$benzene))
linearmodel
## 
## Call:
## lm(formula = all_dataset_subset$avg_annual_count ~ (all_dataset_subset$co + 
##     all_dataset_subset$benzene))
## 
## Coefficients:
##                (Intercept)       all_dataset_subset$co  
##                   12.92002                    -0.04277  
## all_dataset_subset$benzene  
##                    0.69899
anova(linearmodel)
## Analysis of Variance Table
## 
## Response: all_dataset_subset$avg_annual_count
##                            Df  Sum Sq Mean Sq F value  Pr(>F)  
## all_dataset_subset$co       1   194.3  194.25  0.6494 0.42486  
## all_dataset_subset$benzene  1  1462.1 1462.11  4.8881 0.03254 *
## Residuals                  42 12562.9  299.12                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Decision:

We therefore Reject \(H_o\), since calculated calculated value (P-Value) is greater than the tabulated value(Table value = 0.05 )

GENERAL CONCLUSION:

Based on the above analysis done on the three databases, we therefore conclude that there is strong relationship between the carbon-monoxide (co), Temperature (temp_ac) and Benzene. Though, we the datasets might not be able to dtermine the actual casue of the link between the variables, we are strongly confident that a reduction in the exposure to their atmospheric condition (particularyly Benzene) may aid in reducing leukemia in the United States and the World at large.

Note: The analysis is not exclusive as some other contributing factor(s) may also needs to be accessed.

REFERENCES: