Reproducible Research: Peer Assessment 1 by Kuowei Mu

1.Loading and preprocessing the data

# Download the zipped activity data and unzip it into the working directory.
# download.file() returns 0 on success; with method = "curl" a failure only
# raises a warning, so the status is checked explicitly before unzipping.
download_status <- download.file(
  "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2Factivity.zip",
  destfile = "raw_data.zip", method = "curl"
)
if (download_status != 0) {
  stop("Downloading raw_data.zip failed with status ", download_status,
       call. = FALSE)
}
# unzip() returns the path(s) of the extracted file(s)
files <- unzip("raw_data.zip")

# Read the extracted CSV file into the data frame `data`
data <- read.csv(file = files, header = TRUE)

# Clean data
# Parse the date column so it can be grouped and plotted as a Date.
data$date <- as.Date(data$date)
# data1: total steps per day (NA for any day that contains missing step counts,
# because sum() is called without na.rm).
data1 <- tapply(data$steps, data$date, sum)
# data2: one row per day; the dates are recovered from the names of data1.
data2 <- data.frame(date = as.Date(names(data1)), steps = data1)
# data3: mean steps per 5-minute interval across all days, ignoring NAs.
# tapply() returns its result in sorted-level order, so the interval column is
# sorted as well to guarantee that labels and means line up.
data3 <- data.frame(
  interval = sort(unique(data$interval)),
  steps = tapply(data$steps, data$interval, mean, na.rm = TRUE)
)

2.What is mean total number of steps taken per day?

# 1. Make a histogram of the total number of steps taken each day
library(ggplot2)
# A histogram shows the distribution of the daily totals (x = steps, y = number
# of days); plotting steps against date would be a bar chart instead.
# Days whose total is NA are dropped before plotting.
ggplot(data2[!is.na(data2$steps), ], aes(x = steps)) +
  geom_histogram(binwidth = 1000, color = "blue", fill = "blue") +
  xlab("Total steps per day") +
  ylab("Number of days") +
  ggtitle("Daily Steps Taken") +
  theme(plot.title = element_text(lineheight = .8, face = "bold"))

plot of chunk unnamed-chunk-2

# Calculate the mean and median total number of steps taken per day.
# Days without a total (NA) are ignored; the mean is truncated to an integer.
mean_steps <- as.integer(mean(data2$steps, na.rm = TRUE))
median_steps <- median(data2$steps, na.rm = TRUE)

*The mean number of steps taken per day is 10766, and the median number of steps taken per day is 10765.

3.What is the average daily activity pattern?

# Make a time series plot (i.e. type = "l") of the 5-minute interval (x-axis)
# and the average number of steps taken, averaged across all days (y-axis)
# Locate the peak once; which.max() avoids comparing floating-point means
# with ==.
peak_row <- which.max(data3$steps)
peak_interval <- data3$interval[peak_row]
peak_steps <- data3$steps[peak_row]
ggplot(data3, aes(x = interval, y = steps)) +
  geom_line() +
  geom_vline(xintercept = peak_interval, color = "red", linetype = "longdash") +
  ggtitle("Average Daily Activity Pattern by 5-minute Interval") +
  # A negative hjust places the label just to the right of the peak line,
  # wherever the peak happens to fall.
  annotate("text", x = peak_interval, y = peak_steps,
           label = "Max Avg. Daily Activity", hjust = -0.1)

plot of chunk unnamed-chunk-3

# Which 5-minute interval, on average across all the days in the dataset,
# contains the maximum number of steps?
# which.max() returns the position of the (first) maximum without relying on
# floating-point equality.
max_steps_interval <- data3$interval[which.max(data3$steps)]

*The 5-minute interval labeled 835 contains the maximum average number of steps across all the days in the dataset.

4.Imputing missing values

# 1. Calculate and report the total number of missing values in the dataset
# (i.e. the total number of rows with NAs)

# complete.cases() is TRUE for rows without an NA in any column, so its
# negation counts every row that has at least one NA, not only NAs in steps.
missing_values <- sum(!complete.cases(data))

*There are 2304 missing values in the dataset.

# Create a new dataset that is equal to the original dataset but with the
# missing data filled in with the mean value of the same interval
# (rounded down to a whole number of steps).
interval_mean <- tapply(data$steps, data$interval, mean, na.rm = TRUE)
data4 <- data
# Vectorised replacement: look up each missing row's interval by name in
# interval_mean instead of looping over rows. This also behaves correctly
# when the data frame has zero rows, where 1:nrow() would iterate over 1 and 0.
missing_steps <- is.na(data4$steps)
data4$steps[missing_steps] <- floor(unname(
  interval_mean[as.character(data4$interval[missing_steps])]
))

# First rows of the original dataset, which still contain NAs
head(data, n = 6L)

##   steps       date interval
## 1    NA 2012-10-01        0
## 2    NA 2012-10-01        5
## 3    NA 2012-10-01       10
## 4    NA 2012-10-01       15
## 5    NA 2012-10-01       20
## 6    NA 2012-10-01       25

# First rows of the new dataset, where each NA was replaced by the mean steps
# of the same interval
head(data4, n = 6L)

##   steps       date interval
## 1     1 2012-10-01        0
## 2     0 2012-10-01        5
## 3     0 2012-10-01       10
## 4     0 2012-10-01       15
## 5     0 2012-10-01       20
## 6     2 2012-10-01       25

# Histogram and the new mean & median from the new dataset
# library() stops with an error when gridExtra is not installed, whereas
# require() would only return FALSE and let later calls fail.
library(gridExtra)

## Loading required package: gridExtra
## Loading required package: grid

# Side-by-side histograms of the total steps per day, before and after
# imputation. data4 holds one row per interval, so it is first summed to one
# total per day to make both panels show the same quantity.
data4_daily <- aggregate(steps ~ date, data = data4, FUN = sum)
# Days whose original total is NA are dropped instead of triggering a
# "removed rows" warning.
original_plot <- ggplot(data2[!is.na(data2$steps), ], aes(x = steps)) +
  geom_histogram(binwidth = 1000, color = "blue", fill = "blue") +
  xlab("Total steps per day") +
  ylab("Number of days") +
  ggtitle("Daily Steps Taken (Original Dataset)") +
  theme(plot.title = element_text(lineheight = .8, face = "bold"))
new_plot <- ggplot(data4_daily, aes(x = steps)) +
  geom_histogram(binwidth = 1000, color = "purple", fill = "purple") +
  xlab("Total steps per day") +
  ylab("Number of days") +
  ggtitle("Daily Steps Taken (New Dataset)") +
  theme(plot.title = element_text(lineheight = .8, face = "bold"))

grid.arrange(original_plot, new_plot, ncol = 2)

## Warning: Removed 8 rows containing missing values (position_stack).

plot of chunk unnamed-chunk-8

# Mean and median of the total steps per day, before and after imputation.
# data4 has one row per 5-minute interval, so it must be summed per day first;
# taking mean()/median() of data4$steps directly would describe steps per
# interval rather than steps per day.
new_daily_steps <- tapply(data4$steps, data4$date, sum)
original_mean_steps <- as.integer(mean(data2$steps, na.rm = TRUE))
original_median_steps <- median(data2$steps, na.rm = TRUE)
new_mean_steps <- as.integer(mean(new_daily_steps, na.rm = TRUE))
new_median_steps <- median(new_daily_steps, na.rm = TRUE)

*Original dataset mean: 10766

*New dataset mean: 10766

*Original dataset median: 10765

*New dataset median: 10765

5.Are there differences in activity patterns between weekdays and weekends?

# Create a new factor variable with two levels "weekday" and "weekend"
# indicating whether a given date is a weekday or weekend day, and add it to
# the new dataset.
# as.POSIXlt()$wday is 0 for Sunday through 6 for Saturday in every locale,
# whereas weekdays() returns localized day names that only match
# "Saturday"/"Sunday" in an English locale.
is_weekend <- as.POSIXlt(data4$date)$wday %in% c(0, 6)
weekdays <- factor(ifelse(is_weekend, "weekend", "weekday"),
                   levels = c("weekday", "weekend"))
# Assigning by name keeps the column unique if this chunk is run again.
data4$weekdays <- weekdays

# Creating plots
# Average the steps per interval within each day type first, so every panel
# shows one mean activity profile instead of a line through all raw
# observations of all days.
avg_by_day_type <- aggregate(steps ~ interval + weekdays, data = data4,
                             FUN = mean)
ggplot(avg_by_day_type, aes(x = interval, y = steps)) +
  geom_line(color = "blue") +
  ylab("Number of steps") +
  facet_wrap(~weekdays, ncol = 1)

plot of chunk unnamed-chunk-10

Yes, there are differences in activity patterns between weekdays and weekends: on weekdays, activity starts earlier in the morning (from around interval 500) than on weekends, and it remains higher on average than on weekends.