library(plyr)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.3
# Download (once), extract, and load the activity monitoring data set.
file_url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2Factivity.zip"
file_name <- "repdata_data_activity.zip"

# Only hit the network when the archive is not already on disk, so the script
# can be re-run offline. The default download method avoids depending on an
# external curl binary, and mode = "wb" keeps the zip intact on Windows.
if (!file.exists(file_name)) {
  download_status <- download.file(file_url, file_name, mode = "wb")
  if (download_status != 0) {
    stop("Download of ", file_url, " failed with status ", download_status,
         call. = FALSE)
  }
}
unzip(file_name)

activity <- read.csv("./activity.csv", header = TRUE, na.strings = "NA")
str(activity)
## 'data.frame': 17568 obs. of 3 variables:
## $ steps : int NA NA NA NA NA NA NA NA NA NA ...
## $ date : Factor w/ 61 levels "2012-10-01","2012-10-02",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ interval: int 0 5 10 15 20 25 30 35 40 45 ...

# Parse the dates, and treat each interval label as a category so that it can
# be used as a grouping/merge key further down.
activity$date <- as.Date(activity$date)
activity$interval <- as.factor(activity$interval)
# Total number of steps per day; missing step counts are dropped before summing.
activity_1 <- ddply(
  .data = activity,
  .variables = "date",
  .fun = summarize,
  steps = sum(steps, na.rm = TRUE)
)

# Centre of the daily totals, reported as text and marked on the histogram.
steps_mean <- mean(activity_1[["steps"]])
steps_median <- median(activity_1[["steps"]])
sprintf("Mean number of steps: %s", steps_mean)
## [1] "Mean number of steps: 9354.22950819672"
sprintf("Median number of steps: %s", steps_median)
## [1] "Median number of steps: 10395"

# Histogram of daily totals with vertical markers for the mean and median.
ggplot(data = activity_1, mapping = aes(x = steps)) +
  geom_histogram(binwidth = 2000, color = "blue", fill = "white") +
  labs(
    x = "Total steps per day",
    y = "Frequency",
    title = "Total Number of Steps Taken Each Day"
  ) +
  geom_vline(aes(xintercept = steps_mean, color = "mean"), size = 0.7) +
  geom_vline(aes(xintercept = steps_median, color = "median"), size = 0.7)
# Average number of steps in each 5-minute interval across all days
# (missing step counts are ignored).
activity_2 <- ddply(activity, c("interval"), summarize,
  average_steps = mean(steps, na.rm = TRUE)
)

# `interval` is a factor here, and plot() with a factor on the x axis draws
# box plots and ignores type = "l". Recover the numeric interval labels so the
# averages are drawn as a line. as.character() first, otherwise as.numeric()
# would return the internal level codes instead of the labels.
x <- as.numeric(as.character(activity_2$interval))
y <- activity_2$average_steps
plot(x, y,
  type = "l",
  xlab = "5 Minute Intervals",
  ylab = "Average Number of Steps Taken",
  main = "The Average Daily Activity Pattern",
  col = "blue"
)

# Which 5-minute interval, on average across all the days in the dataset,
# contains the maximum number of steps?
# Note: despite its name, max_steps holds the interval label, not a step count.
max_steps <- activity_2$interval[which.max(activity_2$average_steps)]
sprintf("The 5-minute interval with the maximum number of steps on average is: %s", max_steps)
## [1] "The 5-minute interval with the maximum number of steps on average is: 835"
# Count the missing step observations in the raw data.
na_values <- is.na(activity$steps)
sprintf("Number of missing values: %s", sum(na_values))
## [1] "Number of missing values: 2304"

# Attach each interval's average to every observation, then fill the missing
# step counts with that average.
# merge() re-sorts the rows by the key column, so the mask must be computed on
# the merged frame: reusing `na_values` (which follows the row order of
# `activity`) would overwrite real observations and leave true NAs in place.
activity_3 <- merge(activity, activity_2, by = "interval")
missing_steps <- is.na(activity_3$steps)
activity_3$steps[missing_steps] <- activity_3$average_steps[missing_steps]

# Imputed data set reduced to the columns needed for the daily totals;
# selected by name so the result does not depend on merge()'s column order.
new_dataset <- activity_3[, c("date", "steps")]

# Daily totals after imputation.
activity_4 <- ddply(new_dataset, c("date"), summarize,
  steps = sum(steps, na.rm = TRUE)
)
imputed_mean <- mean(activity_4$steps)
imputed_median <- median(activity_4$steps)

# Histogram of the imputed daily totals with mean/median markers.
ggplot(activity_4, aes(x = steps)) +
  geom_histogram(binwidth = 2000, color = "black", fill = "white") +
  xlab("Total steps per day") +
  ylab("Frequency") +
  ggtitle("Total Number of Steps Taken Each Day") +
  geom_vline(aes(xintercept = imputed_mean, color = "mean"), size = 1.0) +
  geom_vline(aes(xintercept = imputed_median, color = "median"), size = 1.0)

# NOTE(review): the outputs previously recorded here (mean 9440.51, median
# 10430.55) were produced with the misaligned NA mask; re-run to refresh them.
sprintf("New mean number of steps: %s", imputed_mean)
sprintf("New median number of steps: %s", imputed_median)
# Average activity pattern for each day of the week.
# weekdays() returns locale-dependent day names; they are used as facet labels.
activity_3$weekdays <- weekdays(activity_3$date)
activity_5 <- ddply(activity_3, c("interval", "weekdays"), summarize,
  average_steps = mean(steps, na.rm = TRUE)
)

# `interval` is a factor; with a discrete x axis every interval becomes its own
# one-point group and geom_smooth() has nothing to fit. Plot against the
# numeric interval labels instead (as.character() first so the labels, not the
# internal level codes, are converted).
activity_5$interval_num <- as.numeric(as.character(activity_5$interval))

# One panel per day name: points for the interval averages plus a smoothed trend.
ggplot(activity_5, aes(x = interval_num, y = average_steps)) +
  geom_point() +
  geom_smooth() +
  facet_grid(weekdays ~ .) +
  labs(
    x = "5 Minute Intervals",
    y = "Average Number of Steps Taken",
    title = "The Average Daily Activity Pattern"
  )
## `geom_smooth()` using method = 'loess'