(1) Loading and preprocessing the data

Download file

library(plyr)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.3
# Source archive for the activity-monitoring data and its local file name.
file_url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2Factivity.zip"
file_name <- "repdata_data_activity.zip"
# Fetch the archive only when it is not already on disk, so re-running the
# analysis does not need network access.
if (!file.exists(file_name)) {
  # mode = "wb" keeps the zip intact on Windows; the default download method
  # is used so no external curl binary is required.
  download.file(file_url, file_name, mode = "wb")
}
# Extract the archive contents into the working directory.
unzip(file_name)

Read file

# Load the extracted CSV (first row is the header, "NA" marks missing values)
# and print a compact overview of its columns.
activity <- read.csv(
  file = "./activity.csv",
  header = TRUE,
  na.strings = "NA"
)
str(activity)
## 'data.frame':    17568 obs. of  3 variables:
##  $ steps   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ date    : Factor w/ 61 levels "2012-10-01","2012-10-02",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ interval: int  0 5 10 15 20 25 30 35 40 45 ...

Format date variable to date class and interval variable to factor class

# Convert both columns in one step: `date` becomes a Date and `interval`
# becomes a factor (one level per distinct interval label).
activity <- transform(
  activity,
  date = as.Date(date),
  interval = as.factor(interval)
)

(2) What is mean total number of steps taken per day?

Plot a histogram of total steps taken each day

Total steps taken each day

# One row per date with the sum of its step counts. Because NAs are dropped
# before summing, a date whose readings are all NA gets a total of 0.
activity_1 <- ddply(
  activity, "date", summarize,
  steps = sum(steps, na.rm = TRUE)
)

Mean and median of the total number of steps taken each day

# Summary statistics of the daily totals; reused below as reference lines.
steps_mean <- mean(activity_1$steps)
steps_median <- median(activity_1$steps)

sprintf("Mean number of steps: %s", steps_mean)
## [1] "Mean number of steps: 9354.22950819672"
sprintf("Median number of steps: %s", steps_median)
## [1] "Median number of steps: 10395"
# Histogram of the daily totals (2000-step bins) with vertical lines marking
# the mean and the median.
ggplot(data = activity_1, mapping = aes(x = steps)) +
    geom_histogram(binwidth = 2000, color = "blue", fill = "white") +
    labs(
        x = "Total steps per day",
        y = "Frequency",
        title = "Total Number of Steps Taken Each Day"
    ) +
    geom_vline(aes(xintercept = steps_mean, color = "mean"), size = 0.7) +
    geom_vline(aes(xintercept = steps_median, color = "median"), size = 0.7)

(3) What is the average daily activity pattern?

Average steps taken in each 5-minute interval, averaged across all days

# One row per 5-minute interval with the mean step count over all dates,
# ignoring missing readings.
activity_2 <- ddply(
  activity, "interval", summarize,
  average_steps = mean(steps, na.rm = TRUE)
)

Make a time-series plot of the 5-minute interval (x-axis) and the average number of steps taken (y-axis)

# `interval` is a factor at this point, and plot(<factor>, <numeric>) draws
# box plots instead of a line. Convert the level labels back to numbers so
# type = "l" yields a real time-series line.
x <- as.numeric(as.character(activity_2$interval))
y <- activity_2$average_steps
plot(x, y,
     type = "l",
     xlab = "5 Minute Intervals",
     ylab = "Average Number of Steps Taken",
     main = "The Average Daily Activity Pattern",
     col = "blue")

# Which 5-minute interval, on average across all the days in the dataset, contains the maximum number of steps?

# Pick the interval label on the row where the average step count peaks.
max_steps <- with(activity_2, interval[which.max(average_steps)])
sprintf(
  "The 5-minute interval with the maximum number of steps on average is: %s",
  max_steps
)
## [1] "The 5-minute interval with the maximum number of steps on average is: 835"

(4) Impute missing values

Calculate and report the total number of missing values (NA)

# Logical mask over the rows of `activity`: TRUE where the step count (the
# first column) is missing. Summing the mask counts the missing rows.
na_values <- is.na(activity[[1]])
sprintf("Number of missing values: %s", sum(na_values))
## [1] "Number of missing values: 2304"

Devise a strategy for filling in all the missing values in the dataset, e.g. use the mean or median for the day, or the mean for the interval

Using the mean steps for the 5-minute intervals, create new dataframe

# Attach each interval's average step count to every observation by joining
# on `interval`. Note that merge() sorts the result by the key column by
# default, so row order differs from `activity`.
activity_3 <- merge(x = activity, y = activity_2, by = "interval")

Create a new dataset that is equal to the original dataset but with the missing values filled in with mean steps

# Fill every missing step count with the mean of its 5-minute interval.
# The NA mask is computed on activity_3 itself: merge() returns rows sorted by
# the join key, so a mask built from the row order of `activity` would point
# at the wrong rows and leave most missing values unfilled.
missing_steps <- is.na(activity_3$steps)
activity_3$steps[missing_steps] <- activity_3$average_steps[missing_steps]
# Keep the date and the (now complete) step counts, selected by name.
new_dataset <- activity_3[, c("date", "steps")]

# Daily totals recomputed from the imputed data.
activity_4 <- ddply(new_dataset, c("date"), summarize,
                     steps  = sum(steps, na.rm = TRUE)
 )

Make a histogram of the total steps taken each day. Calculate the mean and median. Do the values differ from the first part of the assignment?

# Histogram of the daily totals after imputation (2000-step bins), with
# vertical lines marking the new mean and median.
ggplot(data = activity_4, mapping = aes(x = steps)) +
    geom_histogram(binwidth = 2000, color = "black", fill = "white") +
    labs(
        x = "Total steps per day",
        y = "Frequency",
        title = "Total Number of Steps Taken Each Day"
    ) +
    geom_vline(aes(xintercept = mean(activity_4$steps), color = "mean"), size = 1.0) +
    geom_vline(aes(xintercept = median(activity_4$steps), color = "median"), size = 1.0)

# Report the mean and median of the imputed daily totals.
sprintf("New mean number of steps: %s", mean(activity_4$steps))
## [1] "New mean number of steps: 9440.51098051345"
sprintf("New median number of steps: %s", median(activity_4$steps))
## [1] "New median number of steps: 10430.5471698113"

Answer: Mean and median steps are similar to the original values. The shape of the histogram is also similar to the original.

(5) Are there differences in activity patterns between weekends and weekdays?

Create a new factor variable in the dataset with two levels - weekday and weekend

# Classify each date as "weekday" or "weekend". POSIXlt's `wday` field is 0
# for Sunday and 6 for Saturday, which avoids comparing against the
# locale-dependent day names returned by weekdays().
is_weekend <- as.POSIXlt(activity_3$date)$wday %in% c(0, 6)
activity_3$weekdays <- factor(
  ifelse(is_weekend, "weekend", "weekday"),
  levels = c("weekday", "weekend")
)

# Mean steps per 5-minute interval, computed separately for weekdays and
# weekends.
activity_5 <- ddply(activity_3, c("interval", "weekdays"), summarize,
                     average_steps  = mean(steps, na.rm = TRUE)
 )

Make a panel plot containing a time-series plot of the 5-minute interval (x-axis) and the average number of steps taken, averaged across all weekday days or weekend days (y-axis)

# Panel plot of average steps per interval, one panel per value of `weekdays`.
# `interval` is a factor, so its labels are converted back to numbers inside
# aes(); with a discrete x-axis neither a line nor the smoother can be drawn
# across intervals. The unused `x`/`y` temporaries were dropped.
ggplot(
    activity_5,
    aes(x = as.numeric(as.character(interval)), y = average_steps)
) +
    geom_line(color = "blue") +
    geom_smooth() +
    facet_grid(weekdays ~ .) +
    labs(
        x = "5 Minute Intervals",
        y = "Average Number of Steps Taken",
        title = "The Average Daily Activity Pattern"
    )
## `geom_smooth()` using method = 'loess'