Title: Assignment one for Reproducible Research

NOTE: Before executing the code, download the "Activity monitoring data" archive (repdata-data-activity.zip) from the Coursera website and place it in the working directory.

Step 1: Loading and preprocessing the data

# Extract the archive relative to the working directory; unzip() returns the
# path(s) of the extracted file(s), which are passed on to read.csv().
unziprepdata <- unzip("repdata-data-activity.zip")
# `header = TRUE` is spelled out instead of the partially matched `head = T`
# (`T` is an ordinary variable that can be reassigned).
# `stringsAsFactors = TRUE` keeps `date` a factor, as in the str() output
# below, on R versions where read.csv() no longer converts strings by default.
repdata <- read.csv(unziprepdata, header = TRUE, na.strings = "NA",
                    stringsAsFactors = TRUE)
head(repdata)
##   steps       date interval
## 1    NA 2012-10-01        0
## 2    NA 2012-10-01        5
## 3    NA 2012-10-01       10
## 4    NA 2012-10-01       15
## 5    NA 2012-10-01       20
## 6    NA 2012-10-01       25
str(repdata)
## 'data.frame':    17568 obs. of  3 variables:
##  $ steps   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ date    : Factor w/ 61 levels "2012-10-01","2012-10-02",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ interval: int  0 5 10 15 20 25 30 35 40 45 ...

Step 2: What is the mean total number of steps taken per day?

2.1: Calculate the total number of steps taken per day

# Sum the step counts within each date. sum() is called without na.rm, so a
# date containing any NA reading gets an NA total.
totalStepPerDay <- tapply(
  X = repdata[["steps"]],
  INDEX = repdata[["date"]],
  FUN = sum
)
print(totalStepPerDay)
## 2012-10-01 2012-10-02 2012-10-03 2012-10-04 2012-10-05 2012-10-06 
##         NA        126      11352      12116      13294      15420 
## 2012-10-07 2012-10-08 2012-10-09 2012-10-10 2012-10-11 2012-10-12 
##      11015         NA      12811       9900      10304      17382 
## 2012-10-13 2012-10-14 2012-10-15 2012-10-16 2012-10-17 2012-10-18 
##      12426      15098      10139      15084      13452      10056 
## 2012-10-19 2012-10-20 2012-10-21 2012-10-22 2012-10-23 2012-10-24 
##      11829      10395       8821      13460       8918       8355 
## 2012-10-25 2012-10-26 2012-10-27 2012-10-28 2012-10-29 2012-10-30 
##       2492       6778      10119      11458       5018       9819 
## 2012-10-31 2012-11-01 2012-11-02 2012-11-03 2012-11-04 2012-11-05 
##      15414         NA      10600      10571         NA      10439 
## 2012-11-06 2012-11-07 2012-11-08 2012-11-09 2012-11-10 2012-11-11 
##       8334      12883       3219         NA         NA      12608 
## 2012-11-12 2012-11-13 2012-11-14 2012-11-15 2012-11-16 2012-11-17 
##      10765       7336         NA         41       5441      14339 
## 2012-11-18 2012-11-19 2012-11-20 2012-11-21 2012-11-22 2012-11-23 
##      15110       8841       4472      12787      20427      21194 
## 2012-11-24 2012-11-25 2012-11-26 2012-11-27 2012-11-28 2012-11-29 
##      14478      11834      11162      13646      10183       7047 
## 2012-11-30 
##         NA

2.2: Make a histogram of the total number of steps taken each day

# Histogram of the per-day step totals.
hist(
  totalStepPerDay,
  main = "Total number of steps",
  col = "blue"
)

2.3: Calculate the mean and median of the total number of steps taken per day

# Mean of the daily totals, computed over the days whose total is not NA
mean(totalStepPerDay[!is.na(totalStepPerDay)])
## [1] 10766.19
# Median of the daily totals, computed over the days whose total is not NA
median(totalStepPerDay[!is.na(totalStepPerDay)])
## [1] 10765

Step 3: What is the average daily activity pattern?

3.1: A time series plot

# Average number of steps for each interval identifier across all days,
# ignoring missing readings. vapply() guarantees a named numeric vector
# (one value per interval) instead of relying on sapply()'s simplification.
avgstepsbyinterval <- vapply(
  split(repdata$steps, repdata$interval),
  mean,
  numeric(1),
  na.rm = TRUE
)
# Take the x-axis values from the names of the averages so that both vectors
# are aligned by construction, not by the row order of the raw data.
interval <- as.integer(names(avgstepsbyinterval))
plot(interval, avgstepsbyinterval, type = "l",
     xlab = "interval", ylab = "Average Steps by Interval")

3.2: Which interval has the maximum average number of steps?

# Pair every interval identifier with its average step count.
newdata <- data.frame(interval = interval, avgsteps = avgstepsbyinterval)
# Store the interval identifier as a factor.
newdata$interval <- factor(newdata$interval)
# Keep the row(s) whose average equals the largest average.
intervalwithmaxvagsteps <-
  newdata[which(newdata$avgsteps == max(avgstepsbyinterval)), ]
print(intervalwithmaxvagsteps)
##     interval avgsteps
## 835      835 206.1698

Step 4: Imputing missing values

4.1: Calculate and report the total number of missing values in the dataset

# Number of rows that contain at least one missing value
good <- complete.cases(repdata)
length(which(!good))
## [1] 2304
# Tally of complete (FALSE) versus incomplete (TRUE) rows
table(!good)
## 
## FALSE  TRUE 
## 15264  2304
# Count of missing values in each column
vapply(repdata, function(column) sum(is.na(column)), numeric(1))
##    steps     date interval 
##     2304        0        0

4.2: Devise a strategy for filling in all of the missing values in the dataset. The strategy does not need to be sophisticated. For example, you could use the mean/median for that day, or the mean for that 5-minute interval, etc.

# Replace every missing step count with the mean of the observed (non-NA)
# step counts recorded for the same interval on the other days.
# The per-interval means are computed once and looked up by interval id. This
# avoids growing a data frame with rbind() inside a loop, and it touches only
# the `steps` column (the loop-based `dat[is.na(dat)] <- ...` would have
# overwritten an NA in any column).
newrepdata <- local({
  interval_means <- tapply(repdata$steps, repdata$interval, mean, na.rm = TRUE)
  filled <- repdata
  is_missing <- is.na(filled$steps)
  # Look the mean up by name; as.vector() drops the array attributes so a
  # plain numeric vector is assigned into the column.
  filled$steps[is_missing] <-
    as.vector(interval_means[as.character(filled$interval[is_missing])])
  # Return the rows grouped by interval (ties keep their original order),
  # which is the layout the previous interval-by-interval loop produced.
  filled[order(filled$interval), ]
})

4.3: Create a new dataset that is equal to the original dataset but with the missing data filled in.

## rearrange the data according to date and interval
# Load "dplyr" package
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
newrepdata <- newrepdata %>%
  arrange(date, interval)
# Summarise the imputed data first, then the original data, for comparison
summary(newrepdata)
##      steps                date          interval     
##  Min.   :  0.00   2012-10-01:  288   Min.   :   0.0  
##  1st Qu.:  0.00   2012-10-02:  288   1st Qu.: 588.8  
##  Median :  0.00   2012-10-03:  288   Median :1177.5  
##  Mean   : 37.38   2012-10-04:  288   Mean   :1177.5  
##  3rd Qu.: 27.00   2012-10-05:  288   3rd Qu.:1766.2  
##  Max.   :806.00   2012-10-06:  288   Max.   :2355.0  
##                   (Other)   :15840
summary(repdata)
##      steps                date          interval     
##  Min.   :  0.00   2012-10-01:  288   Min.   :   0.0  
##  1st Qu.:  0.00   2012-10-02:  288   1st Qu.: 588.8  
##  Median :  0.00   2012-10-03:  288   Median :1177.5  
##  Mean   : 37.38   2012-10-04:  288   Mean   :1177.5  
##  3rd Qu.: 12.00   2012-10-05:  288   3rd Qu.:1766.2  
##  Max.   :806.00   2012-10-06:  288   Max.   :2355.0  
##  NA's   :2304     (Other)   :15840

4.4: Make a histogram of the total number of steps taken each day and Calculate and report the mean and median total number of steps taken per day. Do these values differ from the estimates from the first part of the assignment? What is the impact of imputing missing data on the estimates of the total daily number of steps?

# Daily step totals computed from the imputed data, shown as a histogram
newtotalStepPerDay <- tapply(
  X = newrepdata[["steps"]],
  INDEX = newrepdata[["date"]],
  FUN = sum
)
hist(
  newtotalStepPerDay,
  main = "Summary of Total Steps",
  xlab = "Total Steps Per Day",
  col = "blue"
)

# Mean of the daily totals after imputation
mean(newtotalStepPerDay, na.rm = TRUE)
## [1] 10766.19
# Median of the daily totals after imputation
median(newtotalStepPerDay, na.rm = TRUE)
## [1] 10766.19

Step 5: Comparison of activity patterns between weekdays and weekends

# Parse the date column (factor or character) into Date objects.
dt <- as.Date(as.character(newrepdata$date), format = "%Y-%m-%d")
# Build a two-level factor, "Weekday" / "Weekend", from the numeric day of the
# week (POSIXlt `wday`: 0 = Sunday, 6 = Saturday). Matching weekdays() against
# English day names would label every day "Weekday" in a non-English locale.
wd <- factor(as.POSIXlt(dt)$wday %in% c(0, 6),
             levels = c(FALSE, TRUE), labels = c("Weekday", "Weekend"))
# Add the new variable to the dataset
newrepdata$weekday <- wd
# Remove the "date" variable by name rather than by column position
newrepdata$date <- NULL
# Average the steps for every interval/weekday combination. group_by() with
# bare column names replaces the deprecated group_by_(.dots = ...) interface.
# `weekday` is already a factor, so the result only needs to be ungrouped and
# converted to a plain data frame.
sumdata <- newrepdata %>%
  group_by(interval, weekday) %>%
  summarize(avgstep = mean(steps)) %>%
  ungroup() %>%
  as.data.frame()
# Check the summarized data
head(sumdata)
##   interval weekday    avgstep
## 1        0 Weekday 2.25115304
## 2        0 Weekend 0.21462264
## 3        5 Weekday 0.44528302
## 4        5 Weekend 0.04245283
## 5       10 Weekday 0.17316562
## 6       10 Weekend 0.01650943
# Plot one panel per weekday level with the lattice package
library(lattice)

xyplot(avgstep ~ interval | weekday, data = sumdata, type = "l",
       layout = c(1, 2), xlab = "Interval",
       ylab = "Average Steps by Interval", main = "Activity Patterns")