Load the necessary packages

library(data.table)
library(dplyr)
library(ggplot2)

Loading and preprocessing the data

Define path directory
filePath<- getwd()
Look at the name of the zipped data set in the directory
list.files(filePath) 
Unzip the zipped loaded data
unzip("activity.zip")
Look at the directory to check the unzipped data file
list.files(filePath) 
Read the csv data and assign the data set name activity
# Read the unzipped CSV. stringsAsFactors = TRUE is stated explicitly so that
# `date` is a factor on R >= 4.0 as well (where the default became FALSE),
# matching the str() and summary() output recorded in this report.
activity <- read.csv("activity.csv", stringsAsFactors = TRUE)
Explore the data and get some overview
summary(activity)
##      steps                date          interval     
##  Min.   :  0.00   2012-10-01:  288   Min.   :   0.0  
##  1st Qu.:  0.00   2012-10-02:  288   1st Qu.: 588.8  
##  Median :  0.00   2012-10-03:  288   Median :1177.5  
##  Mean   : 37.38   2012-10-04:  288   Mean   :1177.5  
##  3rd Qu.: 12.00   2012-10-05:  288   3rd Qu.:1766.2  
##  Max.   :806.00   2012-10-06:  288   Max.   :2355.0  
##  NA's   :2304     (Other)   :15840
str(activity)
## 'data.frame':    17568 obs. of  3 variables:
##  $ steps   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ date    : Factor w/ 61 levels "2012-10-01","2012-10-02",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ interval: int  0 5 10 15 20 25 30 35 40 45 ...
head(activity)
##   steps       date interval
## 1    NA 2012-10-01        0
## 2    NA 2012-10-01        5
## 3    NA 2012-10-01       10
## 4    NA 2012-10-01       15
## 5    NA 2012-10-01       20
## 6    NA 2012-10-01       25
tail(activity)
##       steps       date interval
## 17563    NA 2012-11-30     2330
## 17564    NA 2012-11-30     2335
## 17565    NA 2012-11-30     2340
## 17566    NA 2012-11-30     2345
## 17567    NA 2012-11-30     2350
## 17568    NA 2012-11-30     2355

What is mean total number of steps taken per day?

1.Calculate the total number of steps taken per day

# Sum the steps recorded on each date. sum() is called without na.rm, so a
# date that contains missing step counts gets NA as its total.
DailySteps <- setNames(
  aggregate(activity$steps, by = list(activity$date), FUN = sum),
  c("Date", "Total_Steps")
)
head(DailySteps)
##         Date Total_Steps
## 1 2012-10-01          NA
## 2 2012-10-02         126
## 3 2012-10-03       11352
## 4 2012-10-04       12116
## 5 2012-10-05       13294
## 6 2012-10-06       15420

2.Make a histogram of the total number of steps taken each day

# Keep only the rows in which every column is observed (drops NA step counts).
Non_NAData <- activity[complete.cases(activity), ]

# A histogram bins the per-day totals and counts the days in each bin, so the
# totals are aggregated first. One bar per date would be a bar chart of the
# daily totals over time, not their distribution.
ggplot(aggregate(steps ~ date, Non_NAData, FUN = sum), aes(x = steps)) +
  geom_histogram(
    binwidth = 1000,
    boundary = 0,
    fill = "orange",
    color = "white"
  ) +
  xlab("Total steps per day") +
  ylab("Number of days") +
  ggtitle("Histogram of total steps taken each day")

3. Calculate & report the mean & median of the total number of steps taken per day

# Daily step totals built only from rows with an observed step count, followed
# by the mean and the median of those totals.
Non_NADailySteps <- aggregate(steps ~ date, data = Non_NAData, FUN = sum)
mean(Non_NADailySteps[["steps"]])
## [1] 10766.19
median(Non_NADailySteps[["steps"]])
## [1] 10765

What is the average daily activity pattern?

1. Make a time series plot (i.e. type = “l”) of the 5-minute interval (x-axis) and the average number of steps taken, averaged across all days (y-axis)

# Mean number of steps in each 5-minute interval, averaged over all days that
# have an observed value for that interval.
StepsPerInterval <- aggregate(steps ~ interval, Non_NAData, FUN = mean)

# The plotted series is a per-interval average, so the y axis is labelled as
# an average rather than a total.
plot(
  StepsPerInterval$interval,
  StepsPerInterval$steps,
  type = "l",
  xlab = "Time Intervals (5 minutes)",
  ylab = "Average Steps",
  main = "Average Number of Steps Taken at 5 minutes Interval"
)

2. Which 5-minute interval, on average across all the days in the dataset,contains the maximum number of steps?

# Row(s) of the interval averages whose mean step count equals the maximum.
subset(StepsPerInterval, steps == max(steps))
##     interval    steps
## 104      835 206.1698

Imputing missing values

1. Calculate and report the total number of missing values in the dataset (i.e. the total number of rows with NAs)

# Count the rows that contain an NA in any column, not only in `steps`, which
# is what "rows with NAs" asks for.
sum(!complete.cases(activity))
## [1] 2304

2. Devise a strategy for filling in all of the missing values in the dataset. The strategy does not need to be sophisticated. For example, you could use the mean/median for that day, or the mean for that 5-minute interval, etc.

I will use the mean for that 5-minute interval to replace all the missing values, and then check that all the NAs have been replaced. The code for this strategy is as follows:
# Fill the missing entries of a vector with the mean of its observed entries.
#   x: a numeric vector, possibly containing NAs.
# Returns x with every NA position set to mean(x, na.rm = TRUE).
NA_Data <- function(x) {
  missing <- is.na(x)
  x[missing] <- mean(x, na.rm = TRUE)
  x
}
# Replace each missing step count with the mean of its own 5-minute interval.
# ungroup() drops the grouping afterwards, so later operations on NA_Replaced
# are not silently applied per interval.
NA_Replaced <- activity %>%
  group_by(interval) %>%
  mutate(steps = NA_Data(steps)) %>%
  ungroup()
summary(NA_Replaced)
##      steps                date          interval     
##  Min.   :  0.00   2012-10-01:  288   Min.   :   0.0  
##  1st Qu.:  0.00   2012-10-02:  288   1st Qu.: 588.8  
##  Median :  0.00   2012-10-03:  288   Median :1177.5  
##  Mean   : 37.38   2012-10-04:  288   Mean   :1177.5  
##  3rd Qu.: 27.00   2012-10-05:  288   3rd Qu.:1766.2  
##  Max.   :806.00   2012-10-06:  288   Max.   :2355.0  
##                   (Other)   :15840
Check whether any NAs remain
sum(is.na(NA_Replaced))
## [1] 0

3. Create a new dataset that is equal to the original dataset but with the missing data filled in.

# Convert the dplyr result into a plain data.frame holding the imputed step
# counts, then confirm that it has the same dimensions as the original data.
NAFilledActivity <- as.data.frame(NA_Replaced)
dim(NAFilledActivity)
## [1] 17568     3
str(NAFilledActivity)
## 'data.frame':    17568 obs. of  3 variables:
##  $ steps   : num  1.717 0.3396 0.1321 0.1509 0.0755 ...
##  $ date    : Factor w/ 61 levels "2012-10-01","2012-10-02",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ interval: int  0 5 10 15 20 25 30 35 40 45 ...
summary(NAFilledActivity)
##      steps                date          interval     
##  Min.   :  0.00   2012-10-01:  288   Min.   :   0.0  
##  1st Qu.:  0.00   2012-10-02:  288   1st Qu.: 588.8  
##  Median :  0.00   2012-10-03:  288   Median :1177.5  
##  Mean   : 37.38   2012-10-04:  288   Mean   :1177.5  
##  3rd Qu.: 27.00   2012-10-05:  288   3rd Qu.:1766.2  
##  Max.   :806.00   2012-10-06:  288   Max.   :2355.0  
##                   (Other)   :15840
head(NAFilledActivity)
##       steps       date interval
## 1 1.7169811 2012-10-01        0
## 2 0.3396226 2012-10-01        5
## 3 0.1320755 2012-10-01       10
## 4 0.1509434 2012-10-01       15
## 5 0.0754717 2012-10-01       20
## 6 2.0943396 2012-10-01       25
tail(NAFilledActivity)
##           steps       date interval
## 17563 2.6037736 2012-11-30     2330
## 17564 4.6981132 2012-11-30     2335
## 17565 3.3018868 2012-11-30     2340
## 17566 0.6415094 2012-11-30     2345
## 17567 0.2264151 2012-11-30     2350
## 17568 1.0754717 2012-11-30     2355

4. Make a histogram of the total number of steps taken each day and Calculate and report the mean and median total number of steps taken per day.

Do these values differ from the estimates from the first part of the assignment?

What is the impact of imputing missing data on the estimates of the total daily number of steps?

Histogram of the total number of steps taken each day

# Total steps per day after the missing values have been filled in.
NAFilledDailySteps <- aggregate(steps ~ date, NAFilledActivity, FUN = sum)

# A histogram bins the per-day totals and counts the days in each bin. One bar
# per date would be a bar chart of the daily totals over time, not their
# distribution.
ggplot(NAFilledDailySteps, aes(x = steps)) +
  geom_histogram(
    binwidth = 1000,
    boundary = 0,
    fill = "purple",
    color = "white"
  ) +
  xlab("Total steps per day") +
  ylab("Number of days") +
  ggtitle("Histogram of total steps taken each day after filling NAs")

Mean comparisons

# Mean of the daily totals with imputed values (new) and with the NA rows
# dropped (old), printed one after the other.
NewMean <- mean(NAFilledDailySteps[["steps"]])
OldMean <- mean(Non_NADailySteps[["steps"]])
NewMean
## [1] 10766.19
OldMean
## [1] 10766.19
The means of the two datasets are the same.

Median comparisons

# Median of the daily totals with imputed values (new) and with the NA rows
# dropped (old), printed one after the other.
NewMedian <- median(NAFilledDailySteps[["steps"]])
OldMedian <- median(Non_NADailySteps[["steps"]])
NewMedian
## [1] 10766.19
OldMedian
## [1] 10765
The new median after filling in the NA values (10766.19) is about one step higher than the previous median, which was computed with the NA values removed (10765).

Are there differences in activity patterns between weekdays and weekends?

1. Create a new factor variable in the dataset with two levels – “weekday” and “weekend” indicating whether a given date is a weekday or weekend day.

# POSIXlt's wday is 0 for Sunday and 6 for Saturday. The label is stored as a
# two-level factor, as the task asks, rather than as a character vector.
NAFilledActivity$DayType <- factor(
  ifelse(
    as.POSIXlt(NAFilledActivity$date)$wday %in% c(0, 6),
    "weekend",
    "weekday"
  ),
  levels = c("weekday", "weekend")
)

# Mean steps for every (day type, 5-minute interval) combination.
AvgNAFilledActivity <- aggregate(
  steps ~ DayType + interval,
  data = NAFilledActivity,
  FUN = mean
)
head(AvgNAFilledActivity)
##   DayType interval      steps
## 1 weekday        0 2.25115304
## 2 weekend        0 0.21462264
## 3 weekday        5 0.44528302
## 4 weekend        5 0.04245283
## 5 weekday       10 0.17316562
## 6 weekend       10 0.01650943

2. Make a panel plot containing a time series plot (i.e.type = “l”) of the 5-minute interval (x-axis) and the average number of steps taken, averaged across all weekday days or weekend days(y-axis).

# One stacked panel per day type, each showing the average steps per
# 5-minute interval as a line.
ggplot(AvgNAFilledActivity, aes(x = interval, y = steps, color = DayType)) +
  geom_line(lwd = 1) +
  facet_wrap(~DayType, ncol = 1) +
  labs(
    x = "5 minutes interval",
    y = "Number of steps",
    title = "Day Type Comparison of Steps in 5 minutes Interval"
  )