Setup

Load the package needed to reproduce this report:

library(readr) #need this package to import the dataset

Data Description of Sars_2003

The SARS 2003 Outbreak Complete Dataset, published on Kaggle.com, was obtained for this assignment and downloaded from https://www.kaggle.com/imdevskp/sars-outbreak-2003-complete-dataset/download. The dataset contains 2538 observations and 5 variables; the details of these variables are given below:

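Date (date reported): Date

Country (country affected): Character

Cumulative number of case(s): Numeric (whole-number counts, stored as double)

Number of deaths: Numeric (whole-number counts, stored as double)

Number recovered (number of recovered cases): Numeric (whole-number counts, stored as double)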

Read/Import Data

# Step 1: Read Sars_2003.csv from the working directory into a tibble.
# No column types are supplied, so readr guesses them and reports the
# specification it chose (shown below).
Sars_2003 <- read_csv(file = "Sars_2003.csv")
## Parsed with column specification:
## cols(
##   Date = col_date(format = ""),
##   Country = col_character(),
##   `Cumulative number of case(s)` = col_double(),
##   `Number of deaths` = col_double(),
##   `Number recovered` = col_double()
## )
# Step 2: Preview the first six observations together with their headers.
head(Sars_2003, n = 6L)

Inspect and Understand

# Step 3: Report the dimensions (rows, columns) of the Sars_2003 data frame.
dim(Sars_2003)
## [1] 2538    5
# Step 4: Report the class of each variable, one column at a time.
class(Sars_2003[["Date"]])
## [1] "Date"
class(Sars_2003[["Country"]])
## [1] "character"
class(Sars_2003[["Cumulative number of case(s)"]])
## [1] "numeric"
class(Sars_2003[["Number of deaths"]])
## [1] "numeric"
class(Sars_2003[["Number recovered"]])
## [1] "numeric"
# Step 5: List the column names.
names(Sars_2003)
## [1] "Date"                         "Country"                     
## [3] "Cumulative number of case(s)" "Number of deaths"            
## [5] "Number recovered"
# Step 6: Show the compact structure of the data frame, including the
# column specification that readr attached as the "spec" attribute.
str(object = Sars_2003)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 2538 obs. of  5 variables:
##  $ Date                        : Date, format: "2003-03-17" "2003-03-17" ...
##  $ Country                     : chr  "Germany" "Canada" "Singapore" "Hong Kong SAR, China" ...
##  $ Cumulative number of case(s): num  1 8 20 95 2 1 40 2 8 0 ...
##  $ Number of deaths            : num  0 2 0 1 0 0 1 0 2 0 ...
##  $ Number recovered            : num  0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Date = col_date(format = ""),
##   ..   Country = col_character(),
##   ..   `Cumulative number of case(s)` = col_double(),
##   ..   `Number of deaths` = col_double(),
##   ..   `Number recovered` = col_double()
##   .. )
# Step 7: Display the first six observations of the data.
head(Sars_2003, n = 6L)

Subsetting I

# Step 8: Keep only the first 10 observations (all columns) and display them.
first_ten_rows <- seq_len(10)
Sars_2003_a <- Sars_2003[first_ten_rows, ]
Sars_2003_a
# Step 9: Coerce the 10-row subset to a matrix.

sars_matrix <- as.matrix(x = Sars_2003_a)

# Step 10: Inspect the structure of the new matrix. The columns have mixed
# types, so the result is a 10 x 5 character matrix (see the "chr" below).

str(object = sars_matrix)
##  chr [1:10, 1:5] "2003-03-17" "2003-03-17" "2003-03-17" "2003-03-17" ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : NULL
##   ..$ : chr [1:5] "Date" "Country" "Cumulative number of case(s)" "Number of deaths" ...

Subsetting II

# Step 11: Build a subset holding only the first and the last (fifth)
# variable of Sars_2003, keeping every observation.
first_last_cols <- c(1, 5)
sars_FL <- Sars_2003[, first_last_cols]
head(sars_FL, n = 6L)
# Step 12: Persist the subset as an R object file in the working directory.
save(sars_FL, file = "save_Fl.RData")

Create a new Data Frame

# Step 13: Create a new data frame with 2 variables and 10 observations.
# Student_ID is stored as numeric, so leading zeros in the literals carry no
# meaning; the IDs are simply 1 to 10.
# Fix: the fifth grade was typed as "D," (stray comma), which matched none of
# the factor levels below and silently became NA.
Student_info <- data.frame(
  Student_ID = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
  Grade = c("HD", "HD", "P", "D", "D", "HD", "P", "P", "D", "HD")
)

Student_info
# Step 14: Convert Grade to an ordered factor.
# NOTE(review): the levels are listed as HD, D, P, so HD compares as the
# lowest level; reverse the vector if ascending academic order is intended.

Student_info$Grade <- factor(
  Student_info$Grade,
  levels = c("HD", "D", "P"),
  ordered = TRUE
)
Student_info$Grade
##  [1] HD HD P  D  D  HD P  P  D  HD
## Levels: HD < D < P
# Step 15: Create an integer variable (the L suffix makes each value integer).

sem <- c(1L, 4L, 3L, 2L, 1L, 4L, 3L, 6L, 7L, 5L)

# Step 16: Add the new variable to the data frame as a third column.


Student_info2 <- cbind(Student_info, sem)

# Step 17: Check class, structure, dimensions and attributes of the new
# data frame.

class(Student_info2)
## [1] "data.frame"
str(Student_info2)
## 'data.frame':    10 obs. of  3 variables:
##  $ Student_ID: num  1 2 3 4 5 6 7 8 9 10
##  $ Grade     : Ord.factor w/ 3 levels "HD"<"D"<"P": 1 1 3 2 2 1 3 3 2 1
##  $ sem       : int  1 4 3 2 1 4 3 6 7 5
dim(Student_info2)
## [1] 10  3
attributes(Student_info2)
## $names
## [1] "Student_ID" "Grade"      "sem"       
## 
## $class
## [1] "data.frame"
## 
## $row.names
##  [1]  1  2  3  4  5  6  7  8  9 10