Environment Settings

Not all packages are loaded in a single code chunk, because VIM, mice, and missForest conflict with lubridate; those three are loaded later, just before the imputation step.

library(tidyverse)
library(reshape2)
library(lubridate)
library(gt)
library(glue)
library(paletteer)
library(scales)
# Print the R version, platform, and attached/loaded package versions used
# for this run.
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Catalina 10.15.4
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] scales_1.1.0    paletteer_1.1.0 glue_1.3.2      gt_0.2.0.5     
##  [5] lubridate_1.7.8 reshape2_1.4.3  forcats_0.5.0   stringr_1.4.0  
##  [9] dplyr_0.8.5     purrr_0.3.3     readr_1.3.1     tidyr_1.0.2    
## [13] tibble_3.0.0    ggplot2_3.3.0   tidyverse_1.3.0
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.4        lattice_0.20-40   assertthat_0.2.1  digest_0.6.25    
##  [5] R6_2.4.1          cellranger_1.1.0  plyr_1.8.6        backports_1.1.5  
##  [9] reprex_0.3.0      oompaBase_3.2.9   evaluate_0.14     httr_1.4.1       
## [13] pillar_1.4.3      rlang_0.4.5       readxl_1.3.1      rstudioapi_0.11  
## [17] rmarkdown_2.1     munsell_0.5.0     broom_0.5.5       compiler_3.6.3   
## [21] modelr_0.1.6      xfun_0.12         pkgconfig_2.0.3   scico_1.1.0      
## [25] htmltools_0.4.0   tidyselect_1.0.0  viridisLite_0.3.0 fansi_0.4.1      
## [29] crayon_1.3.4      dbplyr_1.4.2      withr_2.1.2       jcolors_0.0.4    
## [33] grid_3.6.3        nlme_3.1-145      jsonlite_1.6.1    gtable_0.3.0     
## [37] lifecycle_0.2.0   DBI_1.1.0         magrittr_1.5      palr_0.2.0       
## [41] pals_1.6          cli_2.0.2         stringi_1.4.6     mapproj_1.2.7    
## [45] fs_1.3.2          xml2_1.2.5        ellipsis_0.3.0    generics_0.0.2   
## [49] vctrs_0.2.4       rematch2_2.1.1    tools_3.6.3       dichromat_2.0-0  
## [53] maps_3.3.0        hms_0.5.3         yaml_2.2.1        colorspace_1.4-1 
## [57] cluster_2.1.0     rvest_0.3.5       knitr_1.28        haven_2.2.0

Load Datasets

# Inputs consumed by LoadUnzip.R: the local archive path and the remote URL.
fname <- "./data/pamd.zip"
fileUrl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2Factivity.zip"

# According to the listing kept with this chunk, LoadUnzip.R contains:
#
#   if (!file.exists("data")) {
#     dir.create("data")
#   }
#   download.file(fileUrl, destfile = "./data/pamd.zip", method = "curl")
#   dateDownloaded <- date()
#   maindf <- read_delim(fname, delim = ",", col_names = TRUE)
#
# i.e. it downloads the archive, records the download time in
# `dateDownloaded`, and reads the data into `maindf`.
source("LoadUnzip.R")
dateDownloaded

Dataset download date Thu Apr 16 20:38:47 2020.

Range of Date

# First and last dates present in the data; both are reused in the table
# subtitles further down.
start_date <- min(maindf$date)
end_date <- max(maindf$date)
c(start_date, end_date)
## [1] "2012-10-01" "2012-11-30"

What is mean total number of steps taken per day?

For this part of the assignment, you can ignore the missing values in the dataset.

  1. Calculate the total number of steps taken per day? Answer: MeanTotal gives us the breakdown of steps each day.

  2. If you do not understand the difference between a histogram and a barplot, research the difference between them. Make a histogram of the total number of steps taken each day

  3. Calculate and report the mean and median of the total number of steps taken per day

GTMeanTotal uses the gt (grammar of tables), scales, glue, and paletteer packages. The yellow highlighted portion gives us the total steps, mean, and median for 53 observations (I used drop_na from the tidyr package to remove the NAs).

# Daily step totals. Rows with missing values are dropped first, so a day
# whose readings are all NA does not appear in the result. `m` is the month
# number, used later as the histogram fill.
MeanTotal <- maindf %>%
  drop_na() %>%
  group_by(date) %>%
  summarise(Steps = sum(steps)) %>%
  mutate(m = month(date))
MeanTotal %>% head()
## # A tibble: 6 x 3
##   date       Steps     m
##   <date>     <dbl> <dbl>
## 1 2012-10-02   126    10
## 2 2012-10-03 11352    10
## 3 2012-10-04 12116    10
## 4 2012-10-05 13294    10
## 5 2012-10-06 15420    10
## 6 2012-10-07 11015    10

Mean and Median

# One-row summary of the daily totals: number of days, grand total, and the
# mean and median steps per day.
StepsMeanMed <- MeanTotal %>%
  summarise(
    Obs = n(),
    SumofSteps = sum(Steps),
    StepsMean = mean(Steps),
    StepsMedian = median(Steps)
  )

# Render the summary as a gt table with every cell shaded yellow and the
# covered date range as the subtitle.
GTMeanTotal <- gt(StepsMeanMed) %>%
  data_color(
    columns = vars(Obs, SumofSteps, StepsMean, StepsMedian),
    colors = "yellow"
  ) %>%
  tab_header(
    title = "Step Mean and Median Summary",
    subtitle = glue("{start_date} to {end_date}")
  )

GTMeanTotal
Step Mean and Median Summary
2012-10-01 to 2012-11-30
Obs SumofSteps StepsMean StepsMedian
53 570608 10766.19 10765

Histogram of Steps taken Each Day

# Histogram of daily step totals (NA rows removed), stacked by month, with a
# vertical line at the mean daily total.
g1 <- ggplot(data = MeanTotal, aes(x = Steps, fill = factor(m)))
gg1 <- g1 +
  geom_histogram(bins = 9, alpha = 0.5) +
  geom_vline(xintercept = mean(MeanTotal$Steps)) +
  labs(
    title = "Personal Movement Activity Monitoring Device",
    # Derive the day count from the data instead of hard-coding it, so the
    # subtitle cannot drift from what is plotted.
    subtitle = glue(
      "Total Steps Each Day, Observation = {nrow(MeanTotal)} days"
    ),
    x = "STEPS",
    y = "FREQUENCY",
    fill = "MONTH"
  )
gg1

What is the average daily activity pattern?

I’m using the gt (grammar of tables) package to show the average number of steps in each 5-minute interval. This table has 288 observations; see below.

# Per-interval summary over all days with recorded data: number of
# observations, total steps, and mean steps for each 5-minute interval.
SummaryIntervalSteps <- maindf %>%
  drop_na() %>%
  group_by(TimeInterval = interval) %>%
  summarise(
    Total_Obs = n(),
    SumOfSteps = sum(steps),
    MeanOfSteps = mean(steps)
  )

# Colour ramp for the numeric columns, built from the ggsci red material
# palette; domain = NULL lets the scale adapt to each column's own range.
red_ramp <- scales::col_numeric(
  palette = as.character(paletteer::paletteer_d(palette = "ggsci::red_material")),
  domain = NULL
)

GTInterval <- gt(SummaryIntervalSteps) %>%
  data_color(
    columns = vars(SumOfSteps, MeanOfSteps),
    colors = red_ramp
  ) %>%
  tab_header(
    title = "Personal Movement Activity Monitoring Device",
    subtitle = glue("{start_date} to {end_date}")
  )

GTInterval
Personal Movement Activity Monitoring Device
2012-10-01 to 2012-11-30
TimeInterval Total_Obs SumOfSteps MeanOfSteps
0 53 91 1.7169811
5 53 18 0.3396226
10 53 7 0.1320755
15 53 8 0.1509434
20 53 4 0.0754717
25 53 111 2.0943396
30 53 28 0.5283019
35 53 46 0.8679245
40 53 0 0.0000000
45 53 78 1.4716981
50 53 16 0.3018868
55 53 7 0.1320755
100 53 17 0.3207547
105 53 36 0.6792453
110 53 8 0.1509434
115 53 18 0.3396226
120 53 0 0.0000000
125 53 59 1.1132075
130 53 97 1.8301887
135 53 9 0.1698113
140 53 9 0.1698113
145 53 20 0.3773585
150 53 14 0.2641509
155 53 0 0.0000000
200 53 0 0.0000000
205 53 0 0.0000000
210 53 60 1.1320755
215 53 0 0.0000000
220 53 0 0.0000000
225 53 7 0.1320755
230 53 0 0.0000000
235 53 12 0.2264151
240 53 0 0.0000000
245 53 0 0.0000000
250 53 82 1.5471698
255 53 50 0.9433962
300 53 0 0.0000000
305 53 0 0.0000000
310 53 0 0.0000000
315 53 0 0.0000000
320 53 11 0.2075472
325 53 33 0.6226415
330 53 86 1.6226415
335 53 31 0.5849057
340 53 26 0.4905660
345 53 4 0.0754717
350 53 0 0.0000000
355 53 0 0.0000000
400 53 63 1.1886792
405 53 50 0.9433962
410 53 136 2.5660377
415 53 0 0.0000000
420 53 18 0.3396226
425 53 19 0.3584906
430 53 218 4.1132075
435 53 35 0.6603774
440 53 185 3.4905660
445 53 44 0.8301887
450 53 165 3.1132075
455 53 59 1.1132075
500 53 0 0.0000000
505 53 83 1.5660377
510 53 159 3.0000000
515 53 119 2.2452830
520 53 176 3.3207547
525 53 157 2.9622642
530 53 111 2.0943396
535 53 321 6.0566038
540 53 849 16.0188679
545 53 972 18.3396226
550 53 2091 39.4528302
555 53 2358 44.4905660
600 53 1669 31.4905660
605 53 2611 49.2641509
610 53 2850 53.7735849
615 53 3363 63.4528302
620 53 2648 49.9622642
625 53 2495 47.0754717
630 53 2764 52.1509434
635 53 2085 39.3396226
640 53 2333 44.0188679
645 53 2341 44.1698113
650 53 1980 37.3584906
655 53 2599 49.0377358
700 53 2322 43.8113208
705 53 2352 44.3773585
710 53 2677 50.5094340
715 53 2889 54.5094340
720 53 2646 49.9245283
725 53 2702 50.9811321
730 53 2951 55.6792453
735 53 2349 44.3207547
740 53 2770 52.2641509
745 53 3686 69.5471698
750 53 3066 57.8490566
755 53 2976 56.1509434
800 53 3889 73.3773585
805 53 3615 68.2075472
810 53 6860 129.4339623
815 53 8349 157.5283019
820 53 9071 171.1509434
825 53 8236 155.3962264
830 53 9397 177.3018868
835 53 10927 206.1698113
840 53 10384 195.9245283
845 53 9517 179.5660377
850 53 9720 183.3962264
855 53 8852 167.0188679
900 53 7603 143.4528302
905 53 6574 124.0377358
910 53 5783 109.1132075
915 53 5730 108.1132075
920 53 5497 103.7169811
925 53 5086 95.9622642
930 53 3509 66.2075472
935 53 2397 45.2264151
940 53 1314 24.7924528
945 53 2054 38.7547170
950 53 1854 34.9811321
955 53 1116 21.0566038
1000 53 2150 40.5660377
1005 53 1430 26.9811321
1010 53 2248 42.4150943
1015 53 2791 52.6603774
1020 53 2063 38.9245283
1025 53 2692 50.7924528
1030 53 2347 44.2830189
1035 53 1983 37.4150943
1040 53 1839 34.6981132
1045 53 1502 28.3396226
1050 53 1330 25.0943396
1055 53 1693 31.9433962
1100 53 1662 31.3584906
1105 53 1573 29.6792453
1110 53 1130 21.3207547
1115 53 1354 25.5471698
1120 53 1504 28.3773585
1125 53 1403 26.4716981
1130 53 1772 33.4339623
1135 53 2649 49.9811321
1140 53 2228 42.0377358
1145 53 2364 44.6037736
1150 53 2440 46.0377358
1155 53 3137 59.1886792
1200 53 3385 63.8679245
1205 53 4648 87.6981132
1210 53 5027 94.8490566
1215 53 4917 92.7735849
1220 53 3360 63.3962264
1225 53 2659 50.1698113
1230 53 2887 54.4716981
1235 53 1718 32.4150943
1240 53 1406 26.5283019
1245 53 2000 37.7358491
1250 53 2388 45.0566038
1255 53 3566 67.2830189
1300 53 2244 42.3396226
1305 53 2114 39.8867925
1310 53 2293 43.2641509
1315 53 2172 40.9811321
1320 53 2451 46.2452830
1325 53 2991 56.4339623
1330 53 2266 42.7547170
1335 53 1332 25.1320755
1340 53 2118 39.9622642
1345 53 2838 53.5471698
1350 53 2508 47.3207547
1355 53 3223 60.8113208
1400 53 2955 55.7547170
1405 53 2754 51.9622642
1410 53 2310 43.5849057
1415 53 2581 48.6981132
1420 53 1880 35.4716981
1425 53 1990 37.5471698
1430 53 2218 41.8490566
1435 53 1458 27.5094340
1440 53 907 17.1132075
1445 53 1382 26.0754717
1450 53 2312 43.6226415
1455 53 2320 43.7735849
1500 53 1591 30.0188679
1505 53 1912 36.0754717
1510 53 1881 35.4905660
1515 53 2059 38.8490566
1520 53 2436 45.9622642
1525 53 2531 47.7547170
1530 53 2551 48.1320755
1535 53 3462 65.3207547
1540 53 4394 82.9056604
1545 53 5229 98.6603774
1550 53 5412 102.1132075
1555 53 4450 83.9622642
1600 53 3293 62.1320755
1605 53 3399 64.1320755
1610 53 3951 74.5471698
1615 53 3348 63.1698113
1620 53 3016 56.9056604
1625 53 3168 59.7735849
1630 53 2325 43.8679245
1635 53 2044 38.5660377
1640 53 2367 44.6603774
1645 53 2409 45.4528302
1650 53 2449 46.2075472
1655 53 2315 43.6792453
1700 53 2471 46.6226415
1705 53 2984 56.3018868
1710 53 2688 50.7169811
1715 53 3245 61.2264151
1720 53 3854 72.7169811
1725 53 4184 78.9433962
1730 53 3654 68.9433962
1735 53 3162 59.6603774
1740 53 3980 75.0943396
1745 53 2995 56.5094340
1750 53 1843 34.7735849
1755 53 1985 37.4528302
1800 53 2156 40.6792453
1805 53 3075 58.0188679
1810 53 3959 74.6981132
1815 53 4522 85.3207547
1820 53 3141 59.2641509
1825 53 3592 67.7735849
1830 53 4118 77.6981132
1835 53 3935 74.2452830
1840 53 4523 85.3396226
1845 53 5271 99.4528302
1850 53 4589 86.5849057
1855 53 4537 85.6037736
1900 53 4498 84.8679245
1905 53 4125 77.8301887
1910 53 3076 58.0377358
1915 53 2828 53.3584906
1920 53 1925 36.3207547
1925 53 1098 20.7169811
1930 53 1452 27.3962264
1935 53 2121 40.0188679
1940 53 1601 30.2075472
1945 53 1354 25.5471698
1950 53 2420 45.6603774
1955 53 1777 33.5283019
2000 53 1040 19.6226415
2005 53 1008 19.0188679
2010 53 1025 19.3396226
2015 53 1767 33.3396226
2020 53 1421 26.8113208
2025 53 1122 21.1698113
2030 53 1447 27.3018868
2035 53 1131 21.3396226
2040 53 1036 19.5471698
2045 53 1130 21.3207547
2050 53 1712 32.3018868
2055 53 1068 20.1509434
2100 53 845 15.9433962
2105 53 913 17.2264151
2110 53 1243 23.4528302
2115 53 1020 19.2452830
2120 53 660 12.4528302
2125 53 425 8.0188679
2130 53 777 14.6603774
2135 53 864 16.3018868
2140 53 460 8.6792453
2145 53 413 7.7924528
2150 53 431 8.1320755
2155 53 139 2.6226415
2200 53 77 1.4528302
2205 53 195 3.6792453
2210 53 255 4.8113208
2215 53 451 8.5094340
2220 53 375 7.0754717
2225 53 461 8.6981132
2230 53 517 9.7547170
2235 53 117 2.2075472
2240 53 17 0.3207547
2245 53 6 0.1132075
2250 53 85 1.6037736
2255 53 244 4.6037736
2300 53 175 3.3018868
2305 53 151 2.8490566
2310 53 0 0.0000000
2315 53 44 0.8301887
2320 53 51 0.9622642
2325 53 84 1.5849057
2330 53 138 2.6037736
2335 53 249 4.6981132
2340 53 175 3.3018868
2345 53 34 0.6415094
2350 53 12 0.2264151
2355 53 57 1.0754717

Timeseries

  1. Make a time series plot (i.e. type=“l”) of the 5-minute interval (x-axis) and the average number of steps taken, averaged across all days (y-axis)
# Time series of the average steps per 5-minute interval across all days.
#
# `TimeInterval` is an HHMM-style label (..., 50, 55, 100, 105, ..., 2355),
# so using it directly as a numeric x value leaves a gap after every hour's
# :55 slot. Convert it to hours since midnight so the 288 intervals are
# evenly spaced along the axis.
g2 <- ggplot(
  data = SummaryIntervalSteps,
  aes(
    x = TimeInterval %/% 100 + (TimeInterval %% 100) / 60,
    y = MeanOfSteps
  )
)
g2 +
  geom_line(color = "red") +
  scale_x_continuous(breaks = seq(0, 24, by = 4)) +
  labs(
    title = "Personal Movement Activity Monitoring Device",
    subtitle = "Average steps per 5-minute interval, across all days",
    x = "Time of day (hours since midnight)",
    y = "Average Steps"
  )

  1. Which 5-minute interval, on average across all the days in the dataset, contains the maximum number of steps? 8:35 a.m.
# Interval(s) whose mean step count equals the overall maximum.
SummaryIntervalSteps %>%
  select(TimeInterval, MeanOfSteps) %>%
  filter(MeanOfSteps == max(MeanOfSteps))
## # A tibble: 1 x 2
##   TimeInterval MeanOfSteps
##          <dbl>       <dbl>
## 1          835        206.

Imputing missing values

Note that there are a number of days/intervals where there are missing values (coded as NA). The presence of missing days may introduce bias into some calculations or summaries of the data.

  1. Calculate and report the total number of missing values in the dataset (i.e. the total number of rows with NAs)
  2. Devise a strategy for filling in all of the missing values in the dataset. The strategy does not need to be sophisticated. For example, you could use the mean/median for that day, or the mean for that 5-minute interval, etc.
# Missing-value summary per calendar year, rendered as a gt table: number and
# share of NA step readings, and the number of rows belonging to that year.
Yeardf <- maindf %>%
  group_by(Year = year(date)) %>%
  summarize(
    ISNA_Count = sum(is.na(steps)),
    Mean_ISNA = mean(is.na(steps)),
    # n() counts the rows of the current group. The previous
    # length(maindf$date) ignored the grouping and would report the size of
    # the whole dataset for every year.
    Total_Obs = n()
  ) %>%
  ungroup() %>%
  gt() %>%
  tab_header(
    title = "Year 'IS.NA' Summary",
    subtitle = glue("{start_date} to {end_date}")
  )
Yeardf
Year 'IS.NA' Summary
2012-10-01 to 2012-11-30
Year ISNA_Count Mean_ISNA Total_Obs
2012 2304 0.1311475 17568

Monthly Missing Data

# Missing-value summary per month (abbreviated month labels): row count,
# number of NA step readings, and the NA share.
Monthdf <- maindf %>%
  group_by(M = month(date, label = TRUE, abbr = TRUE)) %>%
  summarise(
    Total_Obs = n(),
    ISNA_Count = sum(is.na(steps)),
    Mean_ISNA = mean(is.na(steps))
  )

MonthdfGT <- gt(Monthdf) %>%
  tab_header(
    title = "Monthly 'IS.NA' Summary",
    subtitle = glue("{start_date} to {end_date}")
  )
MonthdfGT
Monthly 'IS.NA' Summary
2012-10-01 to 2012-11-30
M Total_Obs ISNA_Count Mean_ISNA
Oct 8928 576 0.06451613
Nov 8640 1728 0.20000000

Weekdays missing data

# Missing-value summary per day of the week (abbreviated labels): row count,
# number of NA step readings, and the NA share.
dailydf <- maindf %>%
  group_by(Daily = wday(date, label = TRUE, abbr = TRUE)) %>%
  summarise(
    Total_Obs = n(),
    ISNA_Count = sum(is.na(steps)),
    Mean_ISNA = mean(is.na(steps))
  )

gt(dailydf) %>%
  tab_header(
    title = "By day 'IS.NA' Summary",
    subtitle = glue("{start_date} to {end_date}")
  )
By day 'IS.NA' Summary
2012-10-01 to 2012-11-30
Daily Total_Obs ISNA_Count Mean_ISNA
Sun 2304 288 0.1250000
Mon 2592 576 0.2222222
Tue 2592 0 0.0000000
Wed 2592 288 0.1111111
Thu 2592 288 0.1111111
Fri 2592 576 0.2222222
Sat 2304 288 0.1250000

Bar chart of missing values by weekday - Tuesday has no missing values, so Tuesdays must be a mandatory workday

# Bar chart of the number of missing step readings per day of the week; each
# bar's height is the ISNA_Count of that day (supplied as the bar weight).
g3 <- ggplot(dailydf, aes(x = Daily))

g3 +
  geom_bar(mapping = aes(weight = ISNA_Count)) +
  labs(
    x = "Daily",
    y = "Frequency",
    title = "Personal Movement Activity Monitoring Device",
    subtitle = "Missing data in Week Days"
  )

  1. Create a new dataset that is equal to the original dataset but with the missing data filled in.
# Load these libraries in this section (not at the top) because they affect
# lubridate's `label` and `abbr` handling.
library(mice)
library(missForest)
library(VIM)

Using aggr function to plot NA.

# VIM::aggr plot of the missing-value share per variable and of the
# missingness patterns; variables are sorted by their amount of missing data.
aggr(
  maindf,
  labels = names(maindf),
  col = c("navyblue", "red"),
  numbers = TRUE,
  sortVars = TRUE,
  cex.axis = .8,
  gap = 5,
  ylab = c("Missing Data", "Pattern")
)

## 
##  Variables sorted by number of missings: 
##  Variable     Count
##     steps 0.1311475
##      date 0.0000000
##  interval 0.0000000

Strategy for Imputing missing data

I will use the Predictive Mean Matching (PMM) model to impute the missing data. Please review the help file ?pmm for more details.

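  1. m = number of multiple imputations. The default is 5 and I picked 3.
  2. maxit = a scalar giving the number of iterations. The default is 5 and I chose 50.
  3. method = "pmm" (predictive mean matching)
  4. seed = an integer passed to set.seed() so the imputation is reproducible (420 here)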
# Multiple imputation of the missing step counts with predictive mean
# matching: 3 imputed datasets, 50 iterations, fixed seed.
imputed_maindf1 <- mice(
  maindf,
  method = "pmm",
  m = 3,
  maxit = 50,
  seed = 420
)
## 
##  iter imp variable
##   1   1  steps
##   1   2  steps
##   1   3  steps
##   2   1  steps
##   2   2  steps
##   2   3  steps
##   3   1  steps
##   3   2  steps
##   3   3  steps
##   4   1  steps
##   4   2  steps
##   4   3  steps
##   5   1  steps
##   5   2  steps
##   5   3  steps
##   6   1  steps
##   6   2  steps
##   6   3  steps
##   7   1  steps
##   7   2  steps
##   7   3  steps
##   8   1  steps
##   8   2  steps
##   8   3  steps
##   9   1  steps
##   9   2  steps
##   9   3  steps
##   10   1  steps
##   10   2  steps
##   10   3  steps
##   11   1  steps
##   11   2  steps
##   11   3  steps
##   12   1  steps
##   12   2  steps
##   12   3  steps
##   13   1  steps
##   13   2  steps
##   13   3  steps
##   14   1  steps
##   14   2  steps
##   14   3  steps
##   15   1  steps
##   15   2  steps
##   15   3  steps
##   16   1  steps
##   16   2  steps
##   16   3  steps
##   17   1  steps
##   17   2  steps
##   17   3  steps
##   18   1  steps
##   18   2  steps
##   18   3  steps
##   19   1  steps
##   19   2  steps
##   19   3  steps
##   20   1  steps
##   20   2  steps
##   20   3  steps
##   21   1  steps
##   21   2  steps
##   21   3  steps
##   22   1  steps
##   22   2  steps
##   22   3  steps
##   23   1  steps
##   23   2  steps
##   23   3  steps
##   24   1  steps
##   24   2  steps
##   24   3  steps
##   25   1  steps
##   25   2  steps
##   25   3  steps
##   26   1  steps
##   26   2  steps
##   26   3  steps
##   27   1  steps
##   27   2  steps
##   27   3  steps
##   28   1  steps
##   28   2  steps
##   28   3  steps
##   29   1  steps
##   29   2  steps
##   29   3  steps
##   30   1  steps
##   30   2  steps
##   30   3  steps
##   31   1  steps
##   31   2  steps
##   31   3  steps
##   32   1  steps
##   32   2  steps
##   32   3  steps
##   33   1  steps
##   33   2  steps
##   33   3  steps
##   34   1  steps
##   34   2  steps
##   34   3  steps
##   35   1  steps
##   35   2  steps
##   35   3  steps
##   36   1  steps
##   36   2  steps
##   36   3  steps
##   37   1  steps
##   37   2  steps
##   37   3  steps
##   38   1  steps
##   38   2  steps
##   38   3  steps
##   39   1  steps
##   39   2  steps
##   39   3  steps
##   40   1  steps
##   40   2  steps
##   40   3  steps
##   41   1  steps
##   41   2  steps
##   41   3  steps
##   42   1  steps
##   42   2  steps
##   42   3  steps
##   43   1  steps
##   43   2  steps
##   43   3  steps
##   44   1  steps
##   44   2  steps
##   44   3  steps
##   45   1  steps
##   45   2  steps
##   45   3  steps
##   46   1  steps
##   46   2  steps
##   46   3  steps
##   47   1  steps
##   47   2  steps
##   47   3  steps
##   48   1  steps
##   48   2  steps
##   48   3  steps
##   49   1  steps
##   49   2  steps
##   49   3  steps
##   50   1  steps
##   50   2  steps
##   50   3  steps

Summary of the Imputed Data.

# Print the mids object's summary: number of imputations, the imputation
# method per variable, and the predictor matrix.
summary(imputed_maindf1)
## Class: mids
## Number of multiple imputations:  3 
## Imputation methods:
##    steps     date interval 
##    "pmm"       ""       "" 
## PredictorMatrix:
##          steps date interval
## steps        0    1        1
## date         1    0        1
## interval     1    1        0

Multiple imputations (m=3)

# First rows of the imputed values for `steps`; one column per imputation.
head(imputed_maindf1$imp$steps)
##   1  2  3
## 1 0 47 47
## 2 0  0  0
## 3 0  0  0
## 4 0  0  0
## 5 0  0  0
## 6 0  0  0

Picking a model from m = 3 to impute the data.

# Build the completed dataset from imputation number 1 of the mids object.
complete_Maindf1 <- complete(imputed_maindf1, 1)

# Per-column summary statistics of the completed dataset.
summary(complete_Maindf1)
##      steps             date               interval     
##  Min.   :  0.00   Min.   :2012-10-01   Min.   :   0.0  
##  1st Qu.:  0.00   1st Qu.:2012-10-16   1st Qu.: 588.8  
##  Median :  0.00   Median :2012-10-31   Median :1177.5  
##  Mean   : 37.07   Mean   :2012-10-31   Mean   :1177.5  
##  3rd Qu.: 12.00   3rd Qu.:2012-11-15   3rd Qu.:1766.2  
##  Max.   :806.00   Max.   :2012-11-30   Max.   :2355.0

Histogram Comparison

  1. Make a histogram of the total number of steps taken each day and Calculate and report the mean and median total number of steps taken per day. Do these values differ from the estimates from the first part of the assignment? What is the impact of imputing missing data on the estimates of the total daily number of steps?
# Daily step totals from the imputed dataset; `m` is the month number used as
# the histogram fill.
CompleteMeanTotal <- complete_Maindf1 %>%
  group_by(date) %>%
  summarize(Steps = sum(steps)) %>%
  mutate(m = month(date))

g4 <- ggplot(data = CompleteMeanTotal, aes(x = Steps, fill = factor(m)))

# Same histogram as gg1, built from the imputed data, with a vertical line at
# the mean daily total.
gg4 <- g4 +
  geom_histogram(bins = 9, alpha = 0.5) +
  geom_vline(xintercept = mean(CompleteMeanTotal$Steps)) +
  labs(
    title = "Personal Movement Activity Monitoring Device",
    # Derive the day count from the data instead of hard-coding it, so the
    # subtitle cannot drift from what is plotted.
    subtitle = glue(
      "Total Steps Each Day (Imputed dataset), ",
      "Observation = {nrow(CompleteMeanTotal)} days"
    ),
    x = "STEPS",
    y = "FREQUENCY",
    fill = "MONTH"
  )

library(cowplot)

# Stack the NA-removed histogram (A) above the imputed one (B).
plot_grid(gg1, gg4, nrow = 2, labels = "AUTO")

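Compared with the NA-removed data, the imputed dataset’s mean daily total decreases by 90.53 steps (10766.19 to 10675.66), the median decreases by 194 steps (10765 to 10571), the total step count increases by 80607, and the number of observed days increases by 8 (53 to 61) because the previously missing days are now filled in. These figures are computed in the comparison below.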

Calculating the steps mean and median.

# Daily step totals from the imputed dataset (drop_na() is kept as a guard in
# case any value was left unimputed).
CompleteTotal <- complete_Maindf1 %>%
  drop_na() %>%
  group_by(date) %>%
  summarise(Steps = sum(steps)) %>%
  mutate(m = month(date))

# One-row summary of the imputed daily totals: number of days, grand total,
# and the mean and median steps per day.
CompleteMeanMed <- CompleteTotal %>%
  summarise(
    Obs = n(),
    SumofSteps = sum(Steps),
    StepsMean = mean(Steps),
    StepsMedian = median(Steps)
  )

# gt table of the imputed summary, shaded yellow like the NA-removed one.
GTCompleteMeanMed <- gt(CompleteMeanMed) %>%
  data_color(
    columns = vars(Obs, SumofSteps, StepsMean, StepsMedian),
    colors = "yellow"
  ) %>%
  tab_header(
    title = "Step Mean and Median Summary (imputed)",
    subtitle = glue("{start_date} to {end_date}")
  )


GTCompleteMeanMed

Step Mean and Median Summary (imputed)
2012-10-01 to 2012-11-30
Obs SumofSteps StepsMean StepsMedian
61 651215 10675.66 10571
Original data set with NAs removed.

# Re-print the summary table of the NA-removed data for side-by-side
# comparison with the imputed summary.
GTMeanTotal
Step Mean and Median Summary
2012-10-01 to 2012-11-30
Obs SumofSteps StepsMean StepsMedian
53 570608 10766.19 10765

Comparison formula:

# Differences between the imputed summary and the NA-removed summary
# (imputed minus original) for the total steps and for the mean and median
# of the daily totals.
NewSteps <- CompleteMeanMed$SumofSteps - StepsMeanMed$SumofSteps
NewMean <- CompleteMeanMed$StepsMean - StepsMeanMed$StepsMean
NewMed <- CompleteMeanMed$StepsMedian - StepsMeanMed$StepsMedian
NewSteps
## [1] 80607
NewMean
## [1] -90.53294
NewMed
## [1] -194

Are there differences in activity patterns between weekdays and weekends?

For this part the weekdays() function may be of some help here. Use the dataset with the filled-in missing values for this part.

  1. Create a new factor variable in the dataset with two levels – “weekday” and “weekend” indicating whether a given date is a weekday or weekend day.
# Day codes used to split the data into weekdays and weekends.
# NOTE(review): these look like day-of-week numbers with Sunday = 1 and
# Saturday = 7 (lubridate::wday() default numbering) - confirm that the
# column they are matched against uses the same numbering.
MonFri <- c(2,3,4,5,6)

SatSun <- c(1,7)

New datasets with imputed missing data.

# Tag every row of the imputed data with its day of the week and split the
# data into weekday and weekend subsets.
#
# wday() returns the day of the week; the previous day() returned the day of
# the month (1-31), so the filters below selected calendar days 2-6 and 1, 7
# of each month instead of Mon-Fri and Sat/Sun. week_start = 7 pins the
# numbering to Sunday = 1 ... Saturday = 7, which is what MonFri and SatSun
# encode, regardless of the lubridate.week.start option.
maindf2 <- complete_Maindf1 %>%
  mutate(
    DayNumber = wday(date, week_start = 7),
    # Two-level factor requested by the assignment.
    DayType = factor(
      if_else(DayNumber %in% SatSun, "weekend", "weekday"),
      levels = c("weekday", "weekend")
    )
  )

MonFriDf <- maindf2 %>% filter(DayNumber %in% MonFri)

SatSunDf <- maindf2 %>% filter(DayNumber %in% SatSun)

Timeseries Comparison

  1. Make a panel plot containing a time series plot (i.e. type=“l”) of the 5-minute interval (x-axis) and the average number of steps taken, averaged across all weekday days or weekend days (y-axis). See the README file in the GitHub repository to see an example of what this plot should look like using simulated data.
# Average steps per 5-minute interval, separately for the Monday-Friday and
# the Saturday-Sunday subsets of the imputed data, shown as a two-row panel.
#
# Two fixes relative to the earlier version:
#   * the axes were labelled "STEPS" / "FREQUENCY" (copied from the
#     histograms); x is the time of day and y is the average step count;
#   * `TimeInterval` is an HHMM-style label (..., 55, 100, ...), so it is
#     converted to hours since midnight to keep the intervals evenly spaced.
TSMonFri <- MonFriDf %>%
  drop_na() %>%
  group_by(TimeInterval = interval) %>%
  summarize(
    Total_Obs = n(),
    SumOfSteps = sum(steps),
    MeanOfSteps = mean(steps)
  )

g5 <- ggplot(
  data = TSMonFri,
  aes(
    x = TimeInterval %/% 100 + (TimeInterval %% 100) / 60,
    y = MeanOfSteps
  )
) +
  geom_line(color = "red") +
  scale_x_continuous(breaks = seq(0, 24, by = 4)) +
  labs(
    title = "Personal Movement Activity Monitoring Device",
    subtitle = "Monday through Friday (imputed data)",
    x = "Time of day (hours since midnight)",
    y = "Average Steps"
  )

TSSatSun <- SatSunDf %>%
  drop_na() %>%
  group_by(TimeInterval = interval) %>%
  summarize(
    Total_Obs = n(),
    SumOfSteps = sum(steps),
    MeanOfSteps = mean(steps)
  )

g6 <- ggplot(
  data = TSSatSun,
  aes(
    x = TimeInterval %/% 100 + (TimeInterval %% 100) / 60,
    y = MeanOfSteps
  )
) +
  geom_line(color = "green") +
  scale_x_continuous(breaks = seq(0, 24, by = 4)) +
  labs(
    title = "Personal Movement Activity Monitoring Device",
    subtitle = "Saturday-Sunday (imputed data)",
    x = "Time of day (hours since midnight)",
    y = "Average Steps"
  )

plot_grid(g5, g6, nrow = 2)