library(knitr)
library(ggplot2)
library(data.table)
# Echo the source of every chunk and hold its printed output until the
# chunk has finished.
opts_chunk$set(echo = TRUE, results = "hold")

# Extract the archive into the working directory; unzip() returns the
# paths of the files it extracted.
activity <- unzip(zipfile = "activity.zip")

# read.csv() already defaults to header = TRUE and sep = ",".
act_data <- read.csv(file = "activity.csv")
str(act_data)
## 'data.frame': 17568 obs. of 3 variables:
## $ steps : int NA NA NA NA NA NA NA NA NA NA ...
## $ date : Factor w/ 61 levels "2012-10-01","2012-10-02",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ interval: int 0 5 10 15 20 25 30 35 40 45 ...
Convert the date and interval columns to more appropriate types (Date and factor).
# Parse the "YYYY-MM-DD" date column into Date objects.
act_data[["date"]] <- as.Date(act_data[["date"]], format = "%Y-%m-%d")
# Treat the interval identifier as a categorical variable.
act_data[["interval"]] <- as.factor(act_data[["interval"]])
# Total steps for each day. The formula interface of aggregate() omits rows
# with missing values (na.action = na.omit) before summing.
steps_per_day <- aggregate(steps ~ date, data = act_data, FUN = sum)
names(steps_per_day) <- c("date", "steps")

# Distribution of the daily totals, in bins 1000 steps wide.
ggplot(steps_per_day, mapping = aes(x = steps)) +
  geom_histogram(fill = "green", binwidth = 1000) +
  ggtitle("Histogram - Steps Taken Per Day") +
  xlab("Steps Per Day") +
  ylab("Frequency")

# Mean and median of the daily totals.
mean_steps_per_day <- mean(steps_per_day[["steps"]])
mean_steps_per_day
median_steps_per_day <- median(steps_per_day[["steps"]])
median_steps_per_day
## [1] 10766.19
## [1] 10765
# Mean number of steps in each interval, averaged across all days.
steps_per_interval <- aggregate(
  steps ~ interval,
  data = act_data,
  FUN = mean,
  na.rm = TRUE
)
# Turn the factor labels back into integer interval identifiers.
steps_per_interval[["interval"]] <- as.integer(
  as.character(steps_per_interval[["interval"]])
)
names(steps_per_interval) <- c("interval", "steps")
Plot the time series of the average number of steps per interval.
# Line chart of the average number of steps against the interval identifier.
ggplot(steps_per_interval, mapping = aes(x = interval, y = steps)) +
  geom_line(colour = "green", size = 1) +
  ggtitle("Average Daily Activity Pattern") +
  xlab("Interval") +
  ylab("Steps")

# Row of the interval with the largest average step count.
max_interval <- steps_per_interval[
  which.max(steps_per_interval[["steps"]]),
]
max_interval
## interval steps
## 104 835 206.1698
# Number of rows whose step count is missing.
missing_values <- length(which(is.na(act_data[["steps"]])))
missing_values
## [1] 2304
# Impute every missing step count with the mean number of steps recorded
# for the same interval (taken from steps_per_interval).
new_act_data <- act_data
index_of_na <- which(is.na(new_act_data$steps))

# Look the means up by interval *value* with match(). The earlier form,
# steps[interval = <factor>], was not a comparison: the argument name is
# ignored by `[`, so it indexed positionally by the factor's integer code
# and only worked while the rows of steps_per_interval happened to be in
# factor-level order. Both sides are compared as character so the lookup
# works whether the interval columns are factor or integer.
new_act_data$steps[index_of_na] <- steps_per_interval$steps[
  match(
    as.character(new_act_data$interval[index_of_na]),
    as.character(steps_per_interval$interval)
  )
]
Let us check whether the above strategy really worked:
# Number of step counts still missing after imputation.
new_missing_values <- length(which(is.na(new_act_data[["steps"]])))
new_missing_values
## [1] 0
# Total steps for each day, computed from the imputed data set.
new_steps_per_day <- aggregate(
  steps ~ date,
  data = new_act_data,
  FUN = sum
)
names(new_steps_per_day) <- c("date", "steps")

# Distribution of the daily totals, in bins 1000 steps wide.
ggplot(new_steps_per_day, mapping = aes(x = steps)) +
  geom_histogram(fill = "green", binwidth = 1000) +
  ggtitle("Histogram - Steps Taken Per Day") +
  xlab("Steps Per Day") +
  ylab("Frequency")
To assess the impact of imputing the missing values, let us compute the mean and median of the steps taken per day.
# Mean and median of the daily totals after imputation.
new_mean_steps_per_day <- mean(new_steps_per_day[["steps"]])
new_mean_steps_per_day
new_median_steps_per_day <- median(new_steps_per_day[["steps"]])
new_median_steps_per_day
## [1] 10766.19
## [1] 10766.19
As we can see, the mean and median of the steps taken per day are now exactly the same (10766.19), which was not the case before imputing the missing values (mean 10766.19, median 10765).
# Copy the imputed data into a data.table and label every row as falling
# on a weekday or a weekend.
dt <- data.table(new_act_data)

# Classify by the numeric day of the week instead of weekdays(): the names
# returned by weekdays() depend on the session locale, so comparing them
# with the English "Saturday"/"Sunday" marks every day as "Weekday" in a
# non-English locale. as.POSIXlt()$wday is 0 for Sunday and 6 for Saturday.
dt[, weekday := as.factor(
  ifelse(as.POSIXlt(date)$wday %in% c(0L, 6L), "Weekend", "Weekday")
)]

# Convert the interval labels back to integers, updating the column by
# reference with := rather than through `$<-`.
dt[, interval := as.integer(as.character(interval))]
head(dt, 10)
## steps date interval weekday
## 1: 1.7169811 2012-10-01 0 Weekday
## 2: 0.3396226 2012-10-01 5 Weekday
## 3: 0.1320755 2012-10-01 10 Weekday
## 4: 0.1509434 2012-10-01 15 Weekday
## 5: 0.0754717 2012-10-01 20 Weekday
## 6: 2.0943396 2012-10-01 25 Weekday
## 7: 0.5283019 2012-10-01 30 Weekday
## 8: 0.8679245 2012-10-01 35 Weekday
## 9: 0.0000000 2012-10-01 40 Weekday
## 10: 1.4716981 2012-10-01 45 Weekday
# Mean steps for every interval, computed separately for each weekday
# category.
steps_per_weekday <- aggregate(
  steps ~ interval + weekday,
  data = dt,
  FUN = mean
)

# One stacked panel per weekday category.
ggplot(steps_per_weekday, mapping = aes(x = interval, y = steps)) +
  geom_line(colour = "green", size = 1) +
  facet_wrap(~ weekday, nrow = 2, ncol = 1) +
  xlab("Interval") +
  ylab("Number of Steps")
Looking at the above graph, we notice that weekday activity has the highest peak of all intervals (> 300) and only one other peak that touches 100. In contrast, the weekend has more peaks over a hundred than the weekdays. Perhaps the person from whom the data was collected leads a more active lifestyle on weekends than on weekdays.