Q 3.1 Get Ebola data from the CDC.

The CDC publishes case counts for the recent Ebola outbreak at “http://www.cdc.gov/vhf/ebola/outbreaks/2014-west-africa/case-counts.html”. Read the Ebola outbreak data and store it as a nice data frame.
library(XML)
## Warning: package 'XML' was built under R version 3.1.2
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.1.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.1.2
## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Download and parse the CDC case-count page ONCE. readHTMLTable() returns a
# list with one data frame per HTML table on the page; indexing that list
# guarantees all three tables come from the same snapshot of the page and
# avoids two redundant network round-trips.
url <- 'http://www.cdc.gov/vhf/ebola/outbreaks/2014-west-africa/case-counts.html'
ebola_tables <- readHTMLTable(url) # read w/o colClasses

# Fail early with a clear message if the page layout changed.
stopifnot(length(ebola_tables) >= 3)

# Pick the first three tables by position.
Ebola1 <- ebola_tables[[1]]
Ebola2 <- ebola_tables[[2]]
Ebola3 <- ebola_tables[[3]]

# Convert each table to a dplyr tbl_df (prints with a
# "Source: local data frame" header).
Ebola1.df <- tbl_df(Ebola1)
Ebola2.df <- tbl_df(Ebola2)
Ebola3.df <- tbl_df(Ebola3)

Ebola1.df
## Source: local data frame [4 x 4]
## 
##        Country Total Cases Laboratory-Confirmed Cases Total Deaths
## 1       Guinea        1971                       1698         1192
## 2      Liberia        7069                       2643         2964
## 3 Sierra Leone        6073                       5056         1250
## 4        Total       15113                       9397         5406
Ebola2.df
## Source: local data frame [3 x 4]
## 
##         Country Total Cases Laboratory-Confirmed Cases Total Deaths
## 1 United States           4                          4            1
## 2          Mali           6                          5            5
## 3         Total          10                          9            6
Ebola3.df
## Source: local data frame [4 x 4]
## 
##    Country Total Cases Laboratory-Confirmed Cases Total Deaths
## 1 Nigeria*          20                         19            8
## 2 Senegal*           1                          1            0
## 3    Spain           1                          1            0
## 4    Total          22                         21            8

Q3.2 Clean up the numbers (no asterisk)

Some values in the data above carry asterisks (e.g. 19**). Get rid of those ‘stars’.

# Some scraped cells carry a trailing "*" (in the output recorded here, the
# Country column of the third table: Nigeria*, Senegal*). Re-read the page
# and keep every table together in a single named list for cleaning.

url <- "http://www.cdc.gov/vhf/ebola/outbreaks/2014-west-africa/case-counts.html"
# No colClasses supplied, so columns keep readHTMLTable()'s default types.
Ebolal_main <- readHTMLTable(doc = url)
Ebolal_main
## $`cases-widespread`
##        Country Total Cases Laboratory-Confirmed Cases Total Deaths
## 1       Guinea        1971                       1698         1192
## 2      Liberia        7069                       2643         2964
## 3 Sierra Leone        6073                       5056         1250
## 4        Total       15113                       9397         5406
## 
## $`cases-travel-associated`
##         Country Total Cases Laboratory-Confirmed Cases Total Deaths
## 1 United States           4                          4            1
## 2          Mali           6                          5            5
## 3         Total          10                          9            6
## 
## $`cases-localized-transmission`
##    Country Total Cases Laboratory-Confirmed Cases Total Deaths
## 1 Nigeria*          20                         19            8
## 2 Senegal*           1                          1            0
## 3    Spain           1                          1            0
## 4    Total          22                         21            8
# First table is fine

# Helper: delete every literal "*" from every cell of a table. The table is
# flattened to a character matrix so gsub() can touch all cells at once, then
# rebuilt as a data frame; every column of the result is therefore
# text-derived rather than numeric.
drop_stars <- function(tbl) {
  cells <- as.matrix(tbl)
  as.data.frame(gsub("\\*", "", cells))
}

# table 2:
# `[, -5]` excludes a fifth column. NOTE(review): the tables printed above
# have only four columns, so this looks like a no-op -- confirm.
ebola_table2 <- drop_stars(Ebolal_main[[2]][, -5])
ebola_table2
##         Country Total Cases Laboratory-Confirmed Cases Total Deaths
## 1 United States           4                          4            1
## 2          Mali           6                          5            5
## 3         Total          10                          9            6
# table 3:
ebola_table3 <- drop_stars(Ebolal_main[[3]][, -5])
ebola_table3
##   Country Total Cases Laboratory-Confirmed Cases Total Deaths
## 1 Nigeria          20                         19            8
## 2 Senegal           1                          1            0
## 3   Spain           1                          1            0
## 4   Total          22                         21            8

Q 3.3 Combine the 3 tables in the page into one dataframe.

# Stack the three tables row-wise: the first table as originally read
# (Ebola1.df), followed by the two asterisk-stripped tables.
# NOTE(review): each source table ends with its own "Total" row (see the
# output below), so `combined` holds three subtotal rows alongside the
# country rows; exclude them before summing any column.
combined <- rbind(Ebola1.df, ebola_table2, ebola_table3)
combined
## Source: local data frame [11 x 4]
## 
##          Country Total Cases Laboratory-Confirmed Cases Total Deaths
## 1         Guinea        1971                       1698         1192
## 2        Liberia        7069                       2643         2964
## 3   Sierra Leone        6073                       5056         1250
## 4          Total       15113                       9397         5406
## 5  United States           4                          4            1
## 6           Mali           6                          5            5
## 7          Total          10                          9            6
## 8        Nigeria          20                         19            8
## 9        Senegal           1                          1            0
## 10         Spain           1                          1            0
## 11         Total          22                         21            8

Q3.4 Write an R Function

# 

Q3.5 Reading from Wikipedia:

http://en.wikipedia.org/wiki/List_of_Ebola_outbreaks Find out how many humans have died of Ebola since 1976. Hints: (1) Consider using readHTMLTable {XML package}. (2) If a page has multiple tables and you want the second table, index the result with [[2]]. (3) If a column name has spaces, put the name in quotes. Example: df$“Human deaths”

Print the total number of Human deaths from this table.

# 1: get the tables using "readHTMLTable", reading every column as text so
#    the counts can be cleaned before they are converted to numbers:

url2 <- 'http://en.wikipedia.org/wiki/List_of_Ebola_outbreaks'
ebola_wiki <- readHTMLTable(url2, colClasses = rep('character', 7))

# 2: pull the "Human deaths" column from the second table and tidy the text.
deaths_raw <- ebola_wiki[[2]]$"Human deaths"
stopifnot(!is.null(deaths_raw)) # column missing => page layout changed

deaths_txt <- gsub(",", "", deaths_raw)            # thousands separators
deaths_txt <- gsub("\\[[^]]*\\]", "", deaths_txt)  # bracketed markers like [1]
deaths_txt <- gsub("^\\s+|\\s+$", "", deaths_txt)  # surrounding whitespace

# 3: sum only the cells that are plain whole numbers. Passing everything to
#    as.numeric() turns non-numeric cells into NA, and sum() propagates NA,
#    which is what made the total come out as NA.
is_count <- grepl("^[0-9]+$", deaths_txt)
if (any(!is_count)) {
  # Report what was left out so the total can be audited.
  message(
    "Skipping non-numeric 'Human deaths' cells: ",
    paste(unique(deaths_txt[!is_count]), collapse = ", ")
  )
}
Sum_human_deaths <- sum(as.numeric(deaths_txt[is_count]))
Sum_human_deaths
# (Recorded output removed: the earlier run printed NA. Re-run this chunk to
#  print the actual total.)

Q 3.6 dplyr

Take the readymade dataset airquality

Q: Calculate the average Ozone levels for each month in the data.

(Make sure you avoid the NA values!)

library(dplyr)
summary(airquality)
##      Ozone           Solar.R           Wind             Temp      
##  Min.   :  1.00   Min.   :  7.0   Min.   : 1.700   Min.   :56.00  
##  1st Qu.: 18.00   1st Qu.:115.8   1st Qu.: 7.400   1st Qu.:72.00  
##  Median : 31.50   Median :205.0   Median : 9.700   Median :79.00  
##  Mean   : 42.13   Mean   :185.9   Mean   : 9.958   Mean   :77.88  
##  3rd Qu.: 63.25   3rd Qu.:258.8   3rd Qu.:11.500   3rd Qu.:85.00  
##  Max.   :168.00   Max.   :334.0   Max.   :20.700   Max.   :97.00  
##  NA's   :37       NA's   :7                                       
##      Month            Day      
##  Min.   :5.000   Min.   : 1.0  
##  1st Qu.:6.000   1st Qu.: 8.0  
##  Median :7.000   Median :16.0  
##  Mean   :6.993   Mean   :15.8  
##  3rd Qu.:8.000   3rd Qu.:23.0  
##  Max.   :9.000   Max.   :31.0  
## 
names(airquality)
## [1] "Ozone"   "Solar.R" "Wind"    "Temp"    "Month"   "Day"
head(airquality)
##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    NA      NA 14.3   56     5   5
## 6    28      NA 14.9   66     5   6
# Method 1: average every column within each month (NAs ignored via
# na.rm = TRUE), then show only the month and its mean Ozone.
df1 <- aggregate(
  airquality,
  by = list(airquality$Month),
  FUN = mean,
  na.rm = TRUE
)
select(df1, Month, Ozone)
##   Month    Ozone
## 1     5 23.61538
## 2     6 29.44444
## 3     7 59.11538
## 4     8 59.96154
## 5     9 31.44828
# Method 2: formula interface. No na.rm is needed here: the formula method of
# aggregate() defaults to na.action = na.omit, so rows with a missing Ozone
# (or Month) are dropped before mean() is ever called.
aggregate(Ozone ~ Month, data = airquality, FUN = mean)
##   Month    Ozone
## 1     5 23.61538
## 2     6 29.44444
## 3     7 59.11538
## 4     8 59.96154
## 5     9 31.44828
# That default is why this matches Method 1 without "na.rm=TRUE".

# Plotting: bar chart of mean Ozone per month. names.arg labels each bar with
# its month number so the chart is readable without the table above.
monthly_ozone <- aggregate(Ozone ~ Month, data = airquality, FUN = mean)
barplot(
  monthly_ozone$Ozone,
  names.arg = monthly_ozone$Month,
  main = "AVG Ozone by Month",
  xlab = "Month",
  ylab = "Mean Ozone"
)

# Method 3: remove the missing Ozone readings first, then aggregate.
# 1. Row 10 is an example of a record whose Ozone is NA:
airquality[10, ]
##    Ozone Solar.R Wind Temp Month Day
## 10    NA     194  8.6   69     5  10
# 2. Keep only the rows where Ozone was actually recorded:
airquality_new <- subset(airquality, !is.na(Ozone))
airquality_new
##     Ozone Solar.R Wind Temp Month Day
## 1      41     190  7.4   67     5   1
## 2      36     118  8.0   72     5   2
## 3      12     149 12.6   74     5   3
## 4      18     313 11.5   62     5   4
## 6      28      NA 14.9   66     5   6
## 7      23     299  8.6   65     5   7
## 8      19      99 13.8   59     5   8
## 9       8      19 20.1   61     5   9
## 11      7      NA  6.9   74     5  11
## 12     16     256  9.7   69     5  12
## 13     11     290  9.2   66     5  13
## 14     14     274 10.9   68     5  14
## 15     18      65 13.2   58     5  15
## 16     14     334 11.5   64     5  16
## 17     34     307 12.0   66     5  17
## 18      6      78 18.4   57     5  18
## 19     30     322 11.5   68     5  19
## 20     11      44  9.7   62     5  20
## 21      1       8  9.7   59     5  21
## 22     11     320 16.6   73     5  22
## 23      4      25  9.7   61     5  23
## 24     32      92 12.0   61     5  24
## 28     23      13 12.0   67     5  28
## 29     45     252 14.9   81     5  29
## 30    115     223  5.7   79     5  30
## 31     37     279  7.4   76     5  31
## 38     29     127  9.7   82     6   7
## 40     71     291 13.8   90     6   9
## 41     39     323 11.5   87     6  10
## 44     23     148  8.0   82     6  13
## 47     21     191 14.9   77     6  16
## 48     37     284 20.7   72     6  17
## 49     20      37  9.2   65     6  18
## 50     12     120 11.5   73     6  19
## 51     13     137 10.3   76     6  20
## 62    135     269  4.1   84     7   1
## 63     49     248  9.2   85     7   2
## 64     32     236  9.2   81     7   3
## 66     64     175  4.6   83     7   5
## 67     40     314 10.9   83     7   6
## 68     77     276  5.1   88     7   7
## 69     97     267  6.3   92     7   8
## 70     97     272  5.7   92     7   9
## 71     85     175  7.4   89     7  10
## 73     10     264 14.3   73     7  12
## 74     27     175 14.9   81     7  13
## 76      7      48 14.3   80     7  15
## 77     48     260  6.9   81     7  16
## 78     35     274 10.3   82     7  17
## 79     61     285  6.3   84     7  18
## 80     79     187  5.1   87     7  19
## 81     63     220 11.5   85     7  20
## 82     16       7  6.9   74     7  21
## 85     80     294  8.6   86     7  24
## 86    108     223  8.0   85     7  25
## 87     20      81  8.6   82     7  26
## 88     52      82 12.0   86     7  27
## 89     82     213  7.4   88     7  28
## 90     50     275  7.4   86     7  29
## 91     64     253  7.4   83     7  30
## 92     59     254  9.2   81     7  31
## 93     39      83  6.9   81     8   1
## 94      9      24 13.8   81     8   2
## 95     16      77  7.4   82     8   3
## 96     78      NA  6.9   86     8   4
## 97     35      NA  7.4   85     8   5
## 98     66      NA  4.6   87     8   6
## 99    122     255  4.0   89     8   7
## 100    89     229 10.3   90     8   8
## 101   110     207  8.0   90     8   9
## 104    44     192 11.5   86     8  12
## 105    28     273 11.5   82     8  13
## 106    65     157  9.7   80     8  14
## 108    22      71 10.3   77     8  16
## 109    59      51  6.3   79     8  17
## 110    23     115  7.4   76     8  18
## 111    31     244 10.9   78     8  19
## 112    44     190 10.3   78     8  20
## 113    21     259 15.5   77     8  21
## 114     9      36 14.3   72     8  22
## 116    45     212  9.7   79     8  24
## 117   168     238  3.4   81     8  25
## 118    73     215  8.0   86     8  26
## 120    76     203  9.7   97     8  28
## 121   118     225  2.3   94     8  29
## 122    84     237  6.3   96     8  30
## 123    85     188  6.3   94     8  31
## 124    96     167  6.9   91     9   1
## 125    78     197  5.1   92     9   2
## 126    73     183  2.8   93     9   3
## 127    91     189  4.6   93     9   4
## 128    47      95  7.4   87     9   5
## 129    32      92 15.5   84     9   6
## 130    20     252 10.9   80     9   7
## 131    23     220 10.3   78     9   8
## 132    21     230 10.9   75     9   9
## 133    24     259  9.7   73     9  10
## 134    44     236 14.9   81     9  11
## 135    21     259 15.5   76     9  12
## 136    28     238  6.3   77     9  13
## 137     9      24 10.9   71     9  14
## 138    13     112 11.5   71     9  15
## 139    46     237  6.9   78     9  16
## 140    18     224 13.8   67     9  17
## 141    13      27 10.3   76     9  18
## 142    24     238 10.3   68     9  19
## 143    16     201  8.0   82     9  20
## 144    13     238 12.6   64     9  21
## 145    23      14  9.2   71     9  22
## 146    36     139 10.3   81     9  23
## 147     7      49 10.3   69     9  24
## 148    14      20 16.6   63     9  25
## 149    30     193  6.9   70     9  26
## 151    14     191 14.3   75     9  28
## 152    18     131  8.0   76     9  29
## 153    20     223 11.5   68     9  30
# 3. Verify:
head(airquality, 10)
##    Ozone Solar.R Wind Temp Month Day
## 1     41     190  7.4   67     5   1
## 2     36     118  8.0   72     5   2
## 3     12     149 12.6   74     5   3
## 4     18     313 11.5   62     5   4
## 5     NA      NA 14.3   56     5   5
## 6     28      NA 14.9   66     5   6
## 7     23     299  8.6   65     5   7
## 8     19      99 13.8   59     5   8
## 9      8      19 20.1   61     5   9
## 10    NA     194  8.6   69     5  10
head(airquality_new, 10)
##    Ozone Solar.R Wind Temp Month Day
## 1     41     190  7.4   67     5   1
## 2     36     118  8.0   72     5   2
## 3     12     149 12.6   74     5   3
## 4     18     313 11.5   62     5   4
## 6     28      NA 14.9   66     5   6
## 7     23     299  8.6   65     5   7
## 8     19      99 13.8   59     5   8
## 9      8      19 20.1   61     5   9
## 11     7      NA  6.9   74     5  11
## 12    16     256  9.7   69     5  12
# Aggregate the filtered data. na.rm is deliberately omitted: the rows with a
# missing Ozone were already removed from airquality_new. (Other columns, e.g.
# Solar.R, still contain NAs, but only Ozone is displayed.)
df1 <- aggregate(
  airquality_new,
  by = list(airquality_new$Month),
  FUN = mean
)
select(df1, Month, Ozone)
##   Month    Ozone
## 1     5 23.61538
## 2     6 29.44444
## 3     7 59.11538
## 4     8 59.96154
## 5     9 31.44828
# The result is the same as method 1