Q3.1 Get Ebola data from the CDC.
The CDC provides data on the recent Ebola Outbreak
http://www.cdc.gov/vhf/ebola/outbreaks/2014-west-africa/case-counts.html Read the Ebola outbreak data and store it as a nice data frame. Hint: You can try readHTMLTable.

Solution

# Q3.1: download the CDC case-count page and parse the HTML tables on it.
# Switch to the author's homework folder only where it exists; an
# unconditional setwd() stops the whole script on any other machine.
homework_dir <- "C:/Users/Manjari/Desktop/Machine learning/Home Work Solutions"
if (dir.exists(homework_dir)) {
  setwd(homework_dir)
}
library(XML)
url_cdc <- "http://www.cdc.gov/vhf/ebola/outbreaks/2014-west-africa/case-counts.html"
# Called on the page URL without an index, readHTMLTable() yields one entry
# per table found (three named tables in the output recorded below).
Ebola_outbreak <- readHTMLTable(url_cdc)
Ebola_outbreak
## $`cases-widespread`
##        Country Total Cases (Suspected, Probable, and Confirmed)
## 1       Guinea                                             3584
## 2     Liberia*                                            10322
## 3 Sierra Leone                                            12371
## 4        Total                                            26277
##   Laboratory-Confirmed Cases Total Deaths
## 1                       3158         2377
## 2                       3151         4608
## 3                       8586         3899
## 4                      14895        10884
## 
## $`cases-travel-associated`
##                                         Country
## 1 No countries currently in this classification
## 2                                         Total
##   Total Cases (Suspected, Probable, and Confirmed)
## 1                                                0
## 2                                                0
##   Laboratory-Confirmed Cases Total Deaths
## 1                          0            0
## 2                          0            0
## 
## $`cases-localized-transmission`
##          Country Total Cases (Suspected, Probable, and Confirmed)
## 1        Nigeria                                               20
## 2        Senegal                                                1
## 3          Spain                                                1
## 4  United States                                                4
## 5           Mali                                                8
## 6 United Kingdom                                                1
## 7          Total                                               35
##   Laboratory-Confirmed Cases Total Deaths
## 1                         19            8
## 2                          1            0
## 3                          1            0
## 4                          4            1
## 5                          7            6
## 6                          1            0
## 7                         33           15

Q3.2 Clean up the numbers (no asterisk) Some numbers in the data above have an asterisk (for example, 19*). Get rid of those ‘stars’.

Hints: try grep/gsub. Hint: When searching for the "*" character you have to use "\\*", because the star is
a special character in regular expressions. Using the double backslash is called "escaping". Example, try this: gsub("\\*", "star", "12*34"). Hint: If a data frame column has odd characters, you can access it using the [row,
column] notation. For example, try df[ ,3] to print the third column.

# Q3.2: strip asterisks from the count columns of the combined CDC tables.
# Switch to the author's homework folder only where it exists; an
# unconditional setwd() stops the whole script on any other machine.
homework_dir <- "C:/Users/Manjari/Desktop/Machine learning/Home Work Solutions"
if (dir.exists(homework_dir)) {
  setwd(homework_dir)
}
url_cdc <- "http://www.cdc.gov/vhf/ebola/outbreaks/2014-west-africa/case-counts.html"
# Download and parse the page once, then pick the three tables out of the
# result instead of fetching the same URL three times.
cdc_tables <- readHTMLTable(url_cdc, header = TRUE)
Ebola_outbreak <- cdc_tables[[1]]
Ebola_outbreak1 <- cdc_tables[[2]]
Ebola_outbreak2 <- cdc_tables[[3]]
dataframe <- rbind(Ebola_outbreak, Ebola_outbreak1, Ebola_outbreak2)
# Columns 2:4 hold the counts: remove any "*" and convert the text to numbers.
df <- lapply(dataframe[2:4], FUN = function(x) as.numeric(gsub("\\*", "", x)))
data.frame(df)
##    Total.Cases..Suspected..Probable..and.Confirmed.
## 1                                              3584
## 2                                             10322
## 3                                             12371
## 4                                             26277
## 5                                                 0
## 6                                                 0
## 7                                                20
## 8                                                 1
## 9                                                 1
## 10                                                4
## 11                                                8
## 12                                                1
## 13                                               35
##    Laboratory.Confirmed.Cases Total.Deaths
## 1                        3158         2377
## 2                        3151         4608
## 3                        8586         3899
## 4                       14895        10884
## 5                           0            0
## 6                           0            0
## 7                          19            8
## 8                           1            0
## 9                           1            0
## 10                          4            1
## 11                          7            6
## 12                          1            0
## 13                         33           15

Q3.3 Combine the 3 tables in the page into one dataframe.
Hint: Use rbind(), e.g. rbind(table1, table2). You have to remove the intermediate total lines. Keep only the rows for each country,
throw out the rest.

# Q3.3: combine the three CDC tables into one data frame of country rows.
library(XML)
url_cdc <- "http://www.cdc.gov/vhf/ebola/outbreaks/2014-west-africa/case-counts.html"
# Download and parse the page once, then pick the three tables out of the
# result instead of fetching the same URL three times.
cdc_tables <- readHTMLTable(url_cdc, header = TRUE)
Ebola_outbreak <- cdc_tables[[1]]
Ebola_outbreak1 <- cdc_tables[[2]]
Ebola_outbreak2 <- cdc_tables[[3]]
dataframe <- rbind(Ebola_outbreak, Ebola_outbreak1, Ebola_outbreak2)
# Keep only real country rows: drop each table's "Total" line and the
# "No countries currently in this classification" placeholder row.
country <- trimws(as.character(dataframe$Country))
is_country <- !is.na(country) & country != "Total" &
  !grepl("^No countries", country)
dataframe <- dataframe[is_country, ]
# Renumber the rows so the gaps left by the removed lines do not show.
rownames(dataframe) <- NULL
dataframe
##          Country Total Cases (Suspected, Probable, and Confirmed)
## 1         Guinea                                             3584
## 2       Liberia*                                            10322
## 3   Sierra Leone                                            12371
## 4        Nigeria                                               20
## 5        Senegal                                                1
## 6          Spain                                                1
## 7  United States                                                4
## 8           Mali                                                8
## 9 United Kingdom                                                1
##   Laboratory-Confirmed Cases Total Deaths
## 1                       3158         2377
## 2                       3151         4608
## 3                       8586         3899
## 4                         19            8
## 5                          1            0
## 6                          1            0
## 7                          4            1
## 8                          7            6
## 9                          1            0

Q3.4 Write an R Function Write an R function that will take the name of a country in West Africa and return
the “Total Cases” of Ebola in that country. Use the Ebola data frame you built for question 3.1.
Input: Country Name, Ebola data frame
Output: The corresponding value for the country for Total Cases of Ebola

# Q3.4 setup: rebuild the combined CDC table that cases_summary() works on.
library(XML)
url_cdc <- "http://www.cdc.gov/vhf/ebola/outbreaks/2014-west-africa/case-counts.html"
# Download and parse the page once, then pick the three tables out of the
# result instead of fetching the same URL three times.
cdc_tables <- readHTMLTable(url_cdc, header = TRUE)
Ebola_outbreak <- cdc_tables[[1]]
Ebola_outbreak1 <- cdc_tables[[2]]
Ebola_outbreak2 <- cdc_tables[[3]]
# Stack the three tables row-wise; they share the same four columns.
dataframe <- rbind(Ebola_outbreak, Ebola_outbreak1, Ebola_outbreak2)
dataframe
##                                          Country
## 1                                         Guinea
## 2                                       Liberia*
## 3                                   Sierra Leone
## 4                                          Total
## 5  No countries currently in this classification
## 6                                          Total
## 7                                        Nigeria
## 8                                        Senegal
## 9                                          Spain
## 10                                 United States
## 11                                          Mali
## 12                                United Kingdom
## 13                                         Total
##    Total Cases (Suspected, Probable, and Confirmed)
## 1                                              3584
## 2                                             10322
## 3                                             12371
## 4                                             26277
## 5                                                 0
## 6                                                 0
## 7                                                20
## 8                                                 1
## 9                                                 1
## 10                                                4
## 11                                                8
## 12                                                1
## 13                                               35
##    Laboratory-Confirmed Cases Total Deaths
## 1                        3158         2377
## 2                        3151         4608
## 3                        8586         3899
## 4                       14895        10884
## 5                           0            0
## 6                           0            0
## 7                          19            8
## 8                           1            0
## 9                           1            0
## 10                          4            1
## 11                          7            6
## 12                          1            0
## 13                         33           15
# Remove every "*" footnote marker from the first four columns, then rebuild
# the cleaned columns into the data frame used by cases_summary().
ebola1 <- lapply(dataframe[1:4], gsub, pattern = "\\*", replacement = "")
df <- data.frame(ebola1)
#' Look up the total Ebola case count for one West African country.
#'
#' @param x Country name as a single character string, e.g. "Guinea".
#' @param data Ebola data frame with a `Country` column and a column whose
#'   name starts with "Total Cases" (written with spaces or dots). Defaults
#'   to the global `df`, so existing one-argument calls keep working.
#' @return A data frame with the columns `Country` and `Total.Cases` holding
#'   the matching row(s); it has zero rows when `x` is not in `data`. For the
#'   non-West-African countries listed in the table, the message
#'   "Not a west African Country" is printed and returned invisibly instead.
cases_summary <- function(x, data = df) {
  stopifnot(is.character(x), length(x) == 1L, is.data.frame(data))

  # Countries that appear in the CDC tables but are not in West Africa.
  non_west_african <- c("United States", "Spain", "United Kingdom")
  if (x %in% non_west_african) {
    print("Not a west African Country")
    return(invisible("Not a west African Country"))
  }

  # The total-cases header is long and differs between the raw table
  # ("Total Cases (...)") and a data.frame() copy ("Total.Cases.."), so it is
  # located by its prefix rather than by an exact name.
  total_col <- grep("^Total.Cases", names(data))
  if (length(total_col) == 0L || !"Country" %in% names(data)) {
    stop("`data` needs a Country column and a Total Cases column",
         call. = FALSE)
  }

  # Compare against names without footnote stars so "Liberia" also matches
  # a row stored as "Liberia*".
  countries <- trimws(gsub("\\*", "", as.character(data[["Country"]])))
  rows <- which(countries == x)

  case <- data[rows, c("Country", names(data)[total_col[1L]]), drop = FALSE]
  names(case) <- c("Country", "Total.Cases")
  case$Country <- countries[rows]
  case
}

Q3.5 Reading from Wikipedia From the wiki page:
http://en.wikipedia.org/wiki/List_of_Ebola_outbreaks Find out how many humans have died of Ebola since 1976.

- Hint: Consider using readHTMLTable() from the XML package.
- If a page has multiple tables and you want the second table, use [[2]]. If a column name has spaces, put the whole name inside quotes, for example df$"Human deaths". Print the total number of human deaths from this table.

Solution

# Q3.5: total human deaths listed on Wikipedia's "List of Ebola outbreaks".
# Switch to the author's homework folder only where it exists; an
# unconditional setwd() stops the whole script on any other machine.
homework_dir <- "C:/Users/Manjari/Desktop/Machine learning/Home Work Solutions"
if (dir.exists(homework_dir)) {
  setwd(homework_dir)
}
library(XML)
url_wikipage <- "http://en.wikipedia.org/wiki/List_of_Ebola_outbreaks"
wiki_tables <- readHTMLTable(url_wikipage, header = TRUE)

# Choose the table by its "Human deaths" column instead of a fixed position,
# so a wrong index fails loudly rather than silently yielding NULL.
has_deaths <- vapply(
  wiki_tables,
  function(tbl) {
    is.data.frame(tbl) &&
      any(grepl("human deaths", names(tbl), ignore.case = TRUE))
  },
  logical(1)
)
if (!any(has_deaths)) {
  stop("No table with a 'Human deaths' column was found on the page",
       call. = FALSE)
}
wikipedia <- wiki_tables[[which(has_deaths)[1L]]]
deaths_col <- grep("human deaths", names(wikipedia), ignore.case = TRUE)[1L]

# Cells are text: cut anything from a "[" footnote marker onwards and remove
# thousands separators before converting. Non-numeric cells become NA.
deaths_text <- as.character(wikipedia[[deaths_col]])
deaths_text <- trimws(gsub(",", "", sub("\\[.*$", "", deaths_text)))
human_deaths <- suppressWarnings(as.numeric(deaths_text))

# The question asks for the total, so add up every row instead of keeping
# only the rows whose Year equals 1976.
# NOTE(review): if the live table carries its own totals row it would be
# counted as well - confirm against the page.
sum(human_deaths, na.rm = TRUE)
# (no output recorded here: the value depends on the live page at run time)

Q3.6 dplyr Take the ready-made dataset airquality. Q: Calculate the average Ozone level for each month in the data.
(Make sure you avoid the NA values!)

# Q3.6: average Ozone level for each month in the airquality dataset.
# Switch to the author's homework folder only where it exists; an
# unconditional setwd() stops the whole script on any other machine.
homework_dir <- "C:/Users/Manjari/Desktop/Machine learning/Home Work Solutions"
if (dir.exists(homework_dir)) {
  setwd(homework_dir)
}
# Group by the values actually present in Month. Looping over
# 1:length(Ozone) treated every row number as a month and produced NaN for
# all of them except the months that exist in the data.
# na.rm = TRUE leaves the missing Ozone readings out of each mean.
monthly_ozone <- tapply(airquality$Ozone, airquality$Month, mean, na.rm = TRUE)
monthly_ozone
##        5        6        7        8        9
## 23.61538 29.44444 59.11538 59.96154 31.44828