Title: “Reproducible Research: Peer Assignment 1”
Author: “Mohamed Rizwan”
Date: “March 15, 2019”
Output: html_document

## Loading libraries

## Warning: package 'dplyr' was built under R version 3.5.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

The zipped CSV file was downloaded from Coursera into the working directory.

Unzip the archive to extract the “activity.csv” file into the working directory.

# Extract the downloaded archive into the current working directory.
unzip(zipfile = "repdata_data_activity.zip")

1. Reading and preprocessing the data

# Load the raw activity data and convert the date column to Date class.
act <- read.csv(file = "activity.csv")
act[["date"]] <- as.Date(act[["date"]])
# Show the first rows as a quick sanity check.
head(act)

##   steps       date interval
## 1    NA 2012-10-01        0
## 2    NA 2012-10-01        5
## 3    NA 2012-10-01       10
## 4    NA 2012-10-01       15
## 5    NA 2012-10-01       20
## 6    NA 2012-10-01       25

2. Histogram of the total number of steps taken each day

# Total steps per day. A day containing any NA sums to NA and is then
# dropped by na.omit(), so only fully recorded days remain.
by_date <- act %>%
  select(date, steps) %>%
  group_by(date) %>%
  summarise(totalsteps = sum(steps)) %>%
  na.omit()

# Distribution of the daily totals.
hist(
  x = by_date$totalsteps,
  breaks = 20,
  main = "Histogram of Total number of steps taken each day",
  xlab = "Total number of steps taken each day"
)

3. Mean and median of the total number of steps taken per day

# Per-column summary statistics (including mean and median) of the daily totals.
summary(object = by_date)

##       date              totalsteps   
##  Min.   :2012-10-02   Min.   :   41  
##  1st Qu.:2012-10-16   1st Qu.: 8841  
##  Median :2012-10-29   Median :10765  
##  Mean   :2012-10-30   Mean   :10766  
##  3rd Qu.:2012-11-16   3rd Qu.:13294  
##  Max.   :2012-11-29   Max.   :21194

From the summary, Mean = 10766 and Median = 10765

4. Time series plot of the average number of steps taken

library(ggplot2)
# Average steps per 5-minute interval across all days, after discarding
# rows with missing values in the selected columns.
by_interval <- act %>%
  select(interval, steps) %>%
  na.omit() %>%
  group_by(interval) %>%
  summarise(averagesteps = mean(steps))

# Line plot of the average daily activity pattern.
ggplot(data = by_interval, mapping = aes(x = interval, y = averagesteps)) +
  geom_line()

5. The 5-minute interval that, on average, contains the maximum number of steps

# Row(s) of by_interval whose average step count equals the maximum.
with(by_interval, by_interval[which(averagesteps == max(averagesteps)), ])

## # A tibble: 1 x 2
##   interval averagesteps
##      <int>        <dbl>
## 1      835         206.

6. Code describing the strategy for imputing missing data

Total number of missing values (NAs) in the dataset

# Count every NA cell across all columns of `act`, then display the count once.
missingvals <- sum(is.na(act))

missingvals

## [1] 2304

Replacing each NA with the mean number of steps for that 5-minute interval, averaged across all days

# Fill the missing entries of a numeric vector with the mean of its
# non-missing entries.
#   x: numeric vector, possibly containing NA.
# Returns `x` with every NA position overwritten by mean(x, na.rm = TRUE);
# if all entries are NA that mean is NaN, so the result is all NaN.
replacewithmean <- function(x) {
  x[is.na(x)] <- mean(x, na.rm = TRUE)
  x
}
# Impute within each 5-minute interval: every missing step count is replaced
# by that interval's mean. The result stays grouped by `interval`.
meandata <- mutate(group_by(act, interval), steps = replacewithmean(steps))
# Show the first rows of the imputed data.
head(meandata)

## # A tibble: 6 x 3
## # Groups:   interval [6]
##    steps date       interval
##    <dbl> <date>        <int>
## 1 1.72   2012-10-01        0
## 2 0.340  2012-10-01        5
## 3 0.132  2012-10-01       10
## 4 0.151  2012-10-01       15
## 5 0.0755 2012-10-01       20
## 6 2.09   2012-10-01       25

Total number of steps taken each day in the imputed dataset

# Total steps per day for the imputed data.
# `meandata` is still grouped by `interval`; drop that grouping first so that
# select() keeps only the requested columns instead of silently re-adding the
# grouping variable.
by_date2 <- meandata %>%
  ungroup() %>%
  select(date, steps) %>%
  group_by(date) %>%
  summarize(totalsteps = sum(steps))

Mean and median of the total number of steps taken per day

# Per-column summary statistics (including mean and median) of the imputed daily totals.
summary(object = by_date2)

##       date              totalsteps   
##  Min.   :2012-10-01   Min.   :   41  
##  1st Qu.:2012-10-16   1st Qu.: 9819  
##  Median :2012-10-31   Median :10766  
##  Mean   :2012-10-31   Mean   :10766  
##  3rd Qu.:2012-11-15   3rd Qu.:12811  
##  Max.   :2012-11-30   Max.   :21194

From the summary, Mean = 10766 and Median = 10766

Histogram of the total number of steps taken each day with the NAs imputed/replaced

# Distribution of the daily totals after imputation.
hist(
  x = by_date2$totalsteps,
  breaks = 20,
  main = "Histogram of Total number of steps taken each day",
  xlab = "Total number of steps taken each day"
)

Difference in the means

# Compare the mean daily total before (by_date) and after (by_date2)
# imputation, then display the difference once.
mean_1 <- mean(by_date$totalsteps, na.rm = TRUE)
mean_2 <- mean(by_date2$totalsteps)
difference <- mean_2 - mean_1

difference

## [1] 0

There’s no difference between the mean total steps per day of the original dataset (by_date) and that of the imputed dataset (by_date2).

7. Differences in activity patterns between weekdays and weekends:

# Label each observation with its weekday name and a Weekday/Weekend flag.
meandata$day <- weekdays(meandata$date)
# Classify from the numeric day of the week (POSIXlt `wday`: 0 = Sunday,
# 6 = Saturday) rather than from the day name, because weekdays() returns
# names in the current locale and would not match "Saturday"/"Sunday"
# outside an English locale. NA dates still yield NA.
meandata$weekend <- with(
  list(wday = as.POSIXlt(meandata$date)$wday),
  ifelse(wday == 0 | wday == 6, "Weekend", "Weekday")
)

Mean of the steps on the weekdays and weekends

# Average steps for every (weekend flag, interval) combination.
# `mean` is passed directly as FUN; `steps` is expected to be NA-free here
# because it was imputed when `meandata` was built.
mean_weekend_weekday <- aggregate(
  meandata$steps,
  by = list(meandata$weekend, meandata$interval),
  FUN = mean
)
# aggregate() names the columns Group.1, Group.2 and x; give them meaningful names.
names(mean_weekend_weekday) <- c("weekend", "interval", "steps")
head(mean_weekend_weekday)

##   weekend interval      steps
## 1 Weekday        0 2.25115304
## 2 Weekend        0 0.21462264
## 3 Weekday        5 0.44528302
## 4 Weekend        5 0.04245283
## 5 Weekday       10 0.17316562
## 6 Weekend       10 0.01650943

Panel plot showing the average number of steps in each interval on weekdays and weekends

# Two stacked panels (one per weekend flag) of average steps by interval.
ggplot(
  data = mean_weekend_weekday,
  mapping = aes(x = interval, y = steps, color = weekend)
) +
  geom_line() +
  facet_grid(weekend ~ .) +
  ggtitle("Comparison of average number of steps in each interval") +
  xlab("5 min-Interval") +
  ylab("Average number of the Steps taken")