Assignment Subsetting Data

Assignment for subsetting data

Question

Import the dataset into R environment
Make sure the variable names are Minggu and states from Johor to WP Labuan
Remove Variable name Malaysia
Rename WP Kuala Lumpur to ‘Kuala Lumpur’
Rename WP Labuan to ‘Labuan’
Remove Observation Total (entire row).
Add another column, as Year (make sure all observation in this year are same, eg: 2013).
Create another object for this dataset.
group the observations to quarterly (Quarter 1 - Quarter 4).(take summation)
Do the same process to year 2014, 2015, 2016, 20117 and 2018.
combine these dataset to one dataset. (suppose you should have 2 dataset (1 by week, another 1 is by quarter).
from quarter dataset, select only quarter 1 and quarter 3 for Perak.
from weekly dataset, select only Perak and Perlis.
from quarter dataset, find the total for the Peninsular Malaysia and East Malaysia based on quarter and year.
from weekly dataset, find the total for the Peninsular Malaysia and East Malaysia based on weekly and year.
from quarter dataset, find the total for Peninsular Malaysia and East Malaysia based on year.
from quarter dataset, find the total for Malaysia based on Quarter and year
From quarter dataset, find the total for Malaysia based on year.

Answer

Import the dataset into R environment

library(httr)
library(readxl)
url<- "http://www.data.gov.my/data/ms_MY/dataset/bbd0dfba-b586-47ae-91ea-6f0b7880d25b/resource/cd971423-e02e-415d-85f5-77c0118d242e/download/2013.xlsx"
GET(url, write_disk(tf<-tempfile(fileext = ".xlsx")))

## Response [http://www.data.gov.my/data/ms_MY/dataset/bbd0dfba-b586-47ae-91ea-6f0b7880d25b/resource/cd971423-e02e-415d-85f5-77c0118d242e/download/2013.xlsx]
##   Date: 2019-10-02 13:57
##   Status: 200
##   Content-Type: application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
##   Size: 22.3 kB
## <ON DISK>  C:\Users\Acer\AppData\Local\Temp\RtmpWOxTGI\file57d460b44b2d.xlsx

data2013 <- read_excel(tf)

## New names:
## * `` -> ...2
## * `` -> ...3
## * `` -> ...4
## * `` -> ...5
## * `` -> ...6
## * ... and 11 more problems

Make sure the variable names are Minggu and states from Johor to WP Labuan

names(data2013) <-lapply(data2013[2,], as.character)

## Warning: Must use a character vector as names.
## This warning is displayed once per session.

data2013 <- data2013[-c(1,2),]

Remove Variable name Malaysia

data2013$Malaysia <- NULL

Rename WP Kuala Lumpur to ‘Kuala Lumpur’

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

data2013 <- rename(data2013,
                   "Kuala Lumpur" = "WP Kuala Lumpur")

Rename WP Labuan to ‘Labuan’

library(dplyr)
data2013 <- rename(data2013, "Labuan" = "WP Labuan")

Remove Observation Total (entire row).

data2013 <- data2013[-53,]

Add another column, as Year (make sure all observation in this year are same, eg: 2013).

data2013$Year <- 2013

Create another object for this dataset.

#order year to be first variable in dataset
data2013 <- select(data2013, Year, everything())


data2013[1:17] <- sapply(data2013[1:17], 
                         as.numeric)

group the observations to quarterly (Quarter 1 - Quarter 4).(take summation)

quarter2013 <- 
  data2013%>%
  mutate(quarter = ifelse(`Epid Minggu` <= 13, "quarter 1",
                          ifelse (`Epid Minggu` <= 26, "quarter 2",
                                  ifelse(`Epid Minggu` <= 39, "quarter 3", "quarter 4"))))

Do the same process to year 2014, 2015, 2016, 2017 and 2018.

#2014
library(httr)
library(readxl)
url<- "http://www.data.gov.my/data/ms_MY/dataset/bbd0dfba-b586-47ae-91ea-6f0b7880d25b/resource/76ed6414-c5cb-4c87-9d48-4b4bc1185bdb/download/2014.xlsx"
GET(url, write_disk(tf<-tempfile(fileext = ".xlsx")))

## Response [http://www.data.gov.my/data/ms_MY/dataset/bbd0dfba-b586-47ae-91ea-6f0b7880d25b/resource/76ed6414-c5cb-4c87-9d48-4b4bc1185bdb/download/2014.xlsx]
##   Date: 2019-10-02 13:57
##   Status: 200
##   Content-Type: application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
##   Size: 22.4 kB
## <ON DISK>  C:\Users\Acer\AppData\Local\Temp\RtmpWOxTGI\file57d44dfbb5f.xlsx

data2014 <- read_excel(tf)

## New names:
## * `` -> ...2
## * `` -> ...3
## * `` -> ...4
## * `` -> ...5
## * `` -> ...6
## * ... and 11 more problems

names(data2014) <-lapply(data2014[2,], as.character)

data2014 <- data2014[-c(1,2),]


data2014$Malaysia <- NULL



library(dplyr)
data2014 <- rename(data2014,
                   "Kuala Lumpur" = "WP Kuala Lumpur")
                   
                   
library(dplyr)
data2014 <- rename(data2014, "Labuan" = "WP Labuan")


data2014 <- data2014[-53,]


data2014$Year <- 2014


#order year to be first variable in dataset
data2014 <- select(data2014, Year, everything())


data2014[1:17] <- sapply(data2014[1:17], 
                         as.numeric)

## Warning in lapply(X = X, FUN = FUN, ...): NAs introduced by coercion

quarter2014 <- 
  data2014%>%
  mutate(quarter = ifelse(`Epid Minggu` <= 14, "quarter 1",
                          ifelse (`Epid Minggu` <= 27, "quarter 2",
                                  ifelse(`Epid Minggu` <= 40, "quarter 3", "quarter 4"))))  


#2015

library(httr)
library(readxl)
url<- "http://www.data.gov.my/data/ms_MY/dataset/bbd0dfba-b586-47ae-91ea-6f0b7880d25b/resource/3ca80408-74cd-4403-8f97-d11e402ddb3d/download/2015.xlsx"
GET(url, write_disk(tf<-tempfile(fileext = ".xlsx")))

## Response [http://www.data.gov.my/data/ms_MY/dataset/bbd0dfba-b586-47ae-91ea-6f0b7880d25b/resource/3ca80408-74cd-4403-8f97-d11e402ddb3d/download/2015.xlsx]
##   Date: 2019-10-02 13:57
##   Status: 200
##   Content-Type: application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
##   Size: 22.3 kB
## <ON DISK>  C:\Users\Acer\AppData\Local\Temp\RtmpWOxTGI\file57d471446b9d.xlsx

data2015 <- read_excel(tf)

## New names:
## * `` -> ...2
## * `` -> ...3
## * `` -> ...4
## * `` -> ...5
## * `` -> ...6
## * ... and 11 more problems

names(data2015) <-lapply(data2015[2,], as.character)

data2015 <- data2015[-c(1,2),]


data2015$Malaysia <- NULL


library(dplyr)
data2015 <- rename(data2015,
                   "Kuala Lumpur" = "WP Kuala Lumpur")
   
   
                   library(dplyr)
data2015 <- rename(data2015, "Labuan" = "WP Labuan")


data2015 <- data2015[-53,]


data2015$Year <- 2015


#order year to be first variable in dataset
data2015 <- select(data2015, Year, everything())


data2015[1:17] <- sapply(data2015[1:17], 
                         as.numeric)
                         
quarter2015 <- 
  data2015%>%
  mutate(quarter = ifelse(`Epid Minggu` <= 13, "quarter 1",
                          ifelse (`Epid Minggu` <= 26, "quarter 2",
                                  ifelse(`Epid Minggu` <= 39, "quarter 3", "quarter 4")))) 
                                  
#2016

library(httr)
library(readxl)
url<- "http://www.data.gov.my/data/ms_MY/dataset/bbd0dfba-b586-47ae-91ea-6f0b7880d25b/resource/40f38fee-e376-45dc-af96-067c588e9e4d/download/2016.xlsx"
GET(url, write_disk(tf<-tempfile(fileext = ".xlsx")))

## Response [http://www.data.gov.my/data/ms_MY/dataset/bbd0dfba-b586-47ae-91ea-6f0b7880d25b/resource/40f38fee-e376-45dc-af96-067c588e9e4d/download/2016.xlsx]
##   Date: 2019-10-02 13:57
##   Status: 200
##   Content-Type: application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
##   Size: 22.2 kB
## <ON DISK>  C:\Users\Acer\AppData\Local\Temp\RtmpWOxTGI\file57d443d757dd.xlsx

data2016 <- read_excel(tf)

## New names:
## * `` -> ...2
## * `` -> ...3
## * `` -> ...4
## * `` -> ...5
## * `` -> ...6
## * ... and 11 more problems

names(data2016) <-lapply(data2016[2,], as.character)

data2016 <- data2016[-c(1,2),]



data2016$Malaysia <- NULL


library(dplyr)
data2016 <- rename(data2016,
                   "Kuala Lumpur" = "WP Kuala Lumpur")
         
         
                   library(dplyr)
data2016 <- rename(data2016, "Labuan" = "WP Labuan")


data2016 <- data2016[-53,]


data2016$Year <- 2016


#order year to be first variable in dataset
data2016 <- select(data2016, Year, everything())


data2016[1:17] <- sapply(data2016[1:17], 
                         as.numeric)
                         
quarter2016 <- 
  data2016%>%
  mutate(quarter = ifelse(`Epid Minggu` <= 13, "quarter 1",
                          ifelse (`Epid Minggu` <= 26, "quarter 2",
                                  ifelse(`Epid Minggu` <= 39, "quarter 3", "quarter 4")))) 
                                  
#2017


library(httr)
library(readxl)
url<- "http://www.data.gov.my/data/ms_MY/dataset/bbd0dfba-b586-47ae-91ea-6f0b7880d25b/resource/66a5e3d0-55cd-4096-a701-e134b91b104d/download/2017.xlsx"
GET(url, write_disk(tf<-tempfile(fileext = ".xlsx")))

## Response [http://www.data.gov.my/data/ms_MY/dataset/bbd0dfba-b586-47ae-91ea-6f0b7880d25b/resource/66a5e3d0-55cd-4096-a701-e134b91b104d/download/2017.xlsx]
##   Date: 2019-10-02 13:57
##   Status: 200
##   Content-Type: application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
##   Size: 22.4 kB
## <ON DISK>  C:\Users\Acer\AppData\Local\Temp\RtmpWOxTGI\file57d42a84d4e.xlsx

data2017 <- read_excel(tf)

## New names:
## * `` -> ...2
## * `` -> ...3
## * `` -> ...4
## * `` -> ...5
## * `` -> ...6
## * ... and 11 more problems

names(data2017) <-lapply(data2017[2,], as.character)

data2017 <- data2017[-c(1,2),]



data2017$Malaysia <- NULL



library(dplyr)
data2017 <- rename(data2017,
                   "Kuala Lumpur" = "WP Kuala Lumpur")
        
        
library(dplyr)
data2017 <- rename(data2017, "Labuan" = "WP Labuan")


data2017 <- data2017[-53,]


data2017$Year <- 2017


#order year to be first variable in dataset
data2017 <- select(data2017, Year, everything())


data2017[1:17] <- sapply(data2017[1:17], 
                         as.numeric)
                         
quarter2017 <- 
  data2017%>%
  mutate(quarter = ifelse(`Epid Minggu` <= 13, "quarter 1",
                          ifelse (`Epid Minggu` <= 26, "quarter 2",
                                  ifelse(`Epid Minggu` <= 39, "quarter 3", "quarter 4"))))

combine these dataset to one dataset. (suppose you should have 2 dataset (1 by week, another 1 is by quarter).

#combine by quarter
combine <- rbind(quarter2013, quarter2014,
                 quarter2015, quarter2016,
                 quarter2017)
        
        
#combine by week
combine_week <- rbind(data2013, data2014,data2015, data2016, data2017)

combine <- select(combine, Year,`Epid Minggu`, quarter, everything())

from quarter dataset, select only quarter 1 and quarter 3 for Perak.

group_by(combine, quarter) %>%
  filter(quarter == "Quarter 1" | quarter == "Quarter 3",
         Year == 2013) %>%
  summarise(Perak_case = sum(Perak))

## # A tibble: 0 x 2
## # ... with 2 variables: quarter <chr>, Perak_case <dbl>

from weekly dataset, select only Perak and Perlis.

group_by(combine, quarter) %>%
    filter(Year == 2014) %>%
    summarise(Perak_case = sum(Perak),
              Perlis_case = sum(Perlis))

## # A tibble: 5 x 3
##   quarter   Perak_case Perlis_case
##   <chr>          <dbl>       <dbl>
## 1 quarter 1        442          32
## 2 quarter 2        376          33
## 3 quarter 3        374          29
## 4 quarter 4        341          23
## 5 <NA>            1611         119

from quarter dataset, find the total for the Peninsular Malaysia and East Malaysia based on quarter and year.

#peninsular Malaysia
group_by(combine,quarter, Year) %>%
 summarise(Johor_case = sum(Johor),
            Kedah_case = sum(Kedah),
            Kelantan_case = sum(Kelantan),
            Melaka_case = sum(Melaka),
            NegeriSembilan_case = sum(`Negeri Sembilan`),
            Pahang_case = sum(Pahang),
            Perak_case = sum(Perak),
            Perlis_case = sum(Perlis),
            PulauPinang_case = sum(`Pulau Pinang`),
            Selangor_case = sum(Selangor),
            Terengganu_case = sum(Terengganu),
            KualaLumpur_case = sum(`Kuala Lumpur`))

## # A tibble: 21 x 14
## # Groups:   quarter [5]
##    quarter  Year Johor_case Kedah_case Kelantan_case Melaka_case
##    <chr>   <dbl>      <dbl>      <dbl>         <dbl>       <dbl>
##  1 quarte~  2013        508        277           341         131
##  2 quarte~  2014        613        308           423         151
##  3 quarte~  2015        580        294           317         113
##  4 quarte~  2016        534        310           337         127
##  5 quarte~  2017        483        322           268         148
##  6 quarte~  2013        499        272           378         138
##  7 quarte~  2014        508        312           317         147
##  8 quarte~  2015        668        324           323         137
##  9 quarte~  2016        540        322           319         163
## 10 quarte~  2017        560        377           305         155
## # ... with 11 more rows, and 8 more variables: NegeriSembilan_case <dbl>,
## #   Pahang_case <dbl>, Perak_case <dbl>, Perlis_case <dbl>,
## #   PulauPinang_case <dbl>, Selangor_case <dbl>, Terengganu_case <dbl>,
## #   KualaLumpur_case <dbl>

#East Malaysia
group_by(combine,quarter, Year) %>%
 summarise(Sabah_case = sum(Sabah),
            Sarawak_case = sum(Sarawak),
            Labuan_case = sum(Labuan))

## # A tibble: 21 x 5
## # Groups:   quarter [5]
##    quarter    Year Sabah_case Sarawak_case Labuan_case
##    <chr>     <dbl>      <dbl>        <dbl>       <dbl>
##  1 quarter 1  2013       1144          677          18
##  2 quarter 1  2014       1173          802          25
##  3 quarter 1  2015       1048          604          25
##  4 quarter 1  2016       1215          618          28
##  5 quarter 1  2017       1135          694          27
##  6 quarter 2  2013       1120          600          30
##  7 quarter 2  2014       1199          627          24
##  8 quarter 2  2015       1066          612          26
##  9 quarter 2  2016       1196          733          35
## 10 quarter 2  2017       1231          671          36
## # ... with 11 more rows

from weekly dataset, find the total for the Peninsular Malaysia and East Malaysia based on weekly and year.

#peninsular Malaysia
group_by(combine_week, 'Epid Minggu', Year) %>%
 summarise(Johor_case = sum(Johor),
            Kedah_case = sum(Kedah),
            Kelantan_case = sum(Kelantan),
            Melaka_case = sum(Melaka),
            NegeriSembilan_case = sum(`Negeri Sembilan`),
            Pahang_case = sum(Pahang),
            Perak_case = sum(Perak),
            Perlis_case = sum(Perlis),
            PulauPinang_case = sum(`Pulau Pinang`),
            Selangor_case = sum(Selangor),
            Terengganu_case = sum(Terengganu),
            KualaLumpur_case = sum(`Kuala Lumpur`))

## # A tibble: 5 x 14
## # Groups:   "Epid Minggu" [1]
##   `"Epid Minggu"`  Year Johor_case Kedah_case Kelantan_case Melaka_case
##   <chr>           <dbl>      <dbl>      <dbl>         <dbl>       <dbl>
## 1 Epid Minggu      2013       2246       1169          1402         552
## 2 Epid Minggu      2014       4568       2551          2717        1302
## 3 Epid Minggu      2015       2409       1279          1233         513
## 4 Epid Minggu      2016       2325       1283          1254         571
## 5 Epid Minggu      2017       2251       1417          1300         639
## # ... with 8 more variables: NegeriSembilan_case <dbl>, Pahang_case <dbl>,
## #   Perak_case <dbl>, Perlis_case <dbl>, PulauPinang_case <dbl>,
## #   Selangor_case <dbl>, Terengganu_case <dbl>, KualaLumpur_case <dbl>

#East Malaysia
group_by(combine_week, 'Epid Minggu',  Year) %>%
 summarise(Sabah_case = sum(Sabah),
            Sarawak_case = sum(Sarawak),
            Labuan_case = sum(Labuan))

## # A tibble: 5 x 5
## # Groups:   "Epid Minggu" [1]
##   `"Epid Minggu"`  Year Sabah_case Sarawak_case Labuan_case
##   <chr>           <dbl>      <dbl>        <dbl>       <dbl>
## 1 Epid Minggu      2013       4526         2672          99
## 2 Epid Minggu      2014       9379         5515         208
## 3 Epid Minggu      2015       4464         2575         116
## 4 Epid Minggu      2016       4953         2857         133
## 5 Epid Minggu      2017       5106         2797         137

from quarter dataset, find the total for Peninsular Malaysia and East Malaysia based on year.

#peninsular Malaysia
group_by(combine, Year) %>%
 summarise(Johor_case = sum(Johor),
            Kedah_case = sum(Kedah),
            Kelantan_case = sum(Kelantan),
            Melaka_case = sum(Melaka),
            NegeriSembilan_case = sum(`Negeri Sembilan`),
            Pahang_case = sum(Pahang),
            Perak_case = sum(Perak),
            Perlis_case = sum(Perlis),
            PulauPinang_case = sum(`Pulau Pinang`),
            Selangor_case = sum(Selangor),
            Terengganu_case = sum(Terengganu),
            KualaLumpur_case = sum(`Kuala Lumpur`))

## # A tibble: 5 x 13
##    Year Johor_case Kedah_case Kelantan_case Melaka_case NegeriSembilan_~
##   <dbl>      <dbl>      <dbl>         <dbl>       <dbl>            <dbl>
## 1  2013       2246       1169          1402         552              631
## 2  2014       4568       2551          2717        1302             1309
## 3  2015       2409       1279          1233         513              667
## 4  2016       2325       1283          1254         571              641
## 5  2017       2251       1417          1300         639              631
## # ... with 7 more variables: Pahang_case <dbl>, Perak_case <dbl>,
## #   Perlis_case <dbl>, PulauPinang_case <dbl>, Selangor_case <dbl>,
## #   Terengganu_case <dbl>, KualaLumpur_case <dbl>

#East Malaysia
group_by(combine, Year) %>%
 summarise(Sabah_case = sum(Sabah),
            Sarawak_case = sum(Sarawak),
            Labuan_case = sum(Labuan))

## # A tibble: 5 x 4
##    Year Sabah_case Sarawak_case Labuan_case
##   <dbl>      <dbl>        <dbl>       <dbl>
## 1  2013       4526         2672          99
## 2  2014       9379         5515         208
## 3  2015       4464         2575         116
## 4  2016       4953         2857         133
## 5  2017       5106         2797         137

from quarter dataset, find the total for Malaysia based on Quarter and year

#Malaysia
group_by(combine, quarter, Year) %>%
summarise(Johor_case = sum(Johor),
            Kedah_case = sum(Kedah),
            Kelantan_case = sum(Kelantan),
            Melaka_case = sum(Melaka),
            NegeriSembilan_case = sum(`Negeri Sembilan`),
            Pahang_case = sum(Pahang),
            Perak_case = sum(Perak),
            Perlis_case = sum(Perlis),
            PulauPinang_case = sum(`Pulau Pinang`),
            Selangor_case = sum(Selangor),
            Terengganu_case = sum(Terengganu),
            KualaLumpur_case = sum(`Kuala Lumpur`),
            Sabah_case = sum(Sabah),
            Sarawak_case = sum(Sarawak),
            Labuan_case = sum(Labuan))

## # A tibble: 21 x 17
## # Groups:   quarter [5]
##    quarter  Year Johor_case Kedah_case Kelantan_case Melaka_case
##    <chr>   <dbl>      <dbl>      <dbl>         <dbl>       <dbl>
##  1 quarte~  2013        508        277           341         131
##  2 quarte~  2014        613        308           423         151
##  3 quarte~  2015        580        294           317         113
##  4 quarte~  2016        534        310           337         127
##  5 quarte~  2017        483        322           268         148
##  6 quarte~  2013        499        272           378         138
##  7 quarte~  2014        508        312           317         147
##  8 quarte~  2015        668        324           323         137
##  9 quarte~  2016        540        322           319         163
## 10 quarte~  2017        560        377           305         155
## # ... with 11 more rows, and 11 more variables: NegeriSembilan_case <dbl>,
## #   Pahang_case <dbl>, Perak_case <dbl>, Perlis_case <dbl>,
## #   PulauPinang_case <dbl>, Selangor_case <dbl>, Terengganu_case <dbl>,
## #   KualaLumpur_case <dbl>, Sabah_case <dbl>, Sarawak_case <dbl>,
## #   Labuan_case <dbl>

From quarter dataset, find the total for Malaysia based on year.

group_by(combine, Year) %>%
summarise(Johor_case = sum(Johor),
            Kedah_case = sum(Kedah),
            Kelantan_case = sum(Kelantan),
            Melaka_case = sum(Melaka),
            NegeriSembilan_case = sum(`Negeri Sembilan`),
            Pahang_case = sum(Pahang),
            Perak_case = sum(Perak),
            Perlis_case = sum(Perlis),
            PulauPinang_case = sum(`Pulau Pinang`),
            Selangor_case = sum(Selangor),
            Terengganu_case = sum(Terengganu),
            KualaLumpur_case = sum(`Kuala Lumpur`),
            Sabah_case = sum(Sabah),
            Sarawak_case = sum(Sarawak),
            Labuan_case = sum(Labuan))

## # A tibble: 5 x 16
##    Year Johor_case Kedah_case Kelantan_case Melaka_case NegeriSembilan_~
##   <dbl>      <dbl>      <dbl>         <dbl>       <dbl>            <dbl>
## 1  2013       2246       1169          1402         552              631
## 2  2014       4568       2551          2717        1302             1309
## 3  2015       2409       1279          1233         513              667
## 4  2016       2325       1283          1254         571              641
## 5  2017       2251       1417          1300         639              631
## # ... with 10 more variables: Pahang_case <dbl>, Perak_case <dbl>,
## #   Perlis_case <dbl>, PulauPinang_case <dbl>, Selangor_case <dbl>,
## #   Terengganu_case <dbl>, KualaLumpur_case <dbl>, Sabah_case <dbl>,
## #   Sarawak_case <dbl>, Labuan_case <dbl>

Assignment Subsetting Data

Syarah Hashim (2017210184)

10/2/2019

Assignment for subsetting data

Question

Answer