Import the data
## read activity.csv directly from inside activity.zip;
## the three columns are read as integer, Date and integer respectively
activity <- read.table(
  unz("activity.zip", "activity.csv"),
  sep = ",",
  header = TRUE,
  colClasses = c("integer", "Date", "integer")
)
Modify the data
## use the dplyr package
suppressMessages(library(dplyr))
## create more important time based fields
## `interval` encodes clock time as hhmm (e.g. 835 = 08:35), so the hour part
## is floor(interval / 100) and the minute part is interval %% 100
activity <- activity %>%
  mutate(
    ## seconds since midnight, offset from each row's own date
    datetime = as.POSIXct(
      60 * (60 * floor(interval / 100) + interval %% 100),
      origin = date,
      tz = "UTC"
    ),
    ## minutes since midnight
    minutes = 60 * floor(interval / 100) + interval %% 100,
    ## clock time as "HH:MM" text
    time = format(datetime, "%H:%M")
  ) %>%
  arrange(date, interval)
First, sum the number of steps taken to the daily level
## one row per date holding that day's total step count
## note: sum(..., na.rm = TRUE) yields 0 for a date whose readings are all NA,
## so such dates are carried forward as 0-step days
daily <- activity %>%
  group_by(date) %>%
  summarize(numSteps = sum(steps, na.rm = TRUE)) %>%
  arrange(date)
Next, create a histogram of the total number of steps taken each day
## use the ggplot2 package (scales for axis text)
suppressMessages(library(ggplot2))
suppressMessages(library(scales))
## histogram
## The visible range is set with coord_cartesian() / expand_limits() instead of
## `limits =` on the scales: limits on a position scale discard anything that
## falls outside them, so a bar whose edge reaches below 0 or whose height
## exceeds the cap would silently vanish. Zooming keeps every bar.
ggplot(data = daily,
       aes(x = numSteps)) +
  geom_histogram(col = "black",
                 fill = "steelblue",
                 binwidth = 1000) +
  xlab("Total Number of Steps Taken Each Day") +
  ylab("Number of Days") +
  scale_x_continuous(breaks = seq(0, 25000, 5000),
                     labels = comma) +
  ## unit-spaced breaks that follow the actual axis range
  scale_y_continuous(breaks = function(lims) seq(0, floor(lims[2]), by = 1),
                     labels = comma) +
  ## keep the y axis spanning at least 0-10, but let it grow if needed
  expand_limits(y = c(0, 10)) +
  coord_cartesian(xlim = c(0, 25000))
Finally, report the mean and median number of steps taken per day:
- The mean is 9,354
- The median is 10,395
with(daily, mean(numSteps))
## [1] 9354.23
with(daily, median(numSteps))
## [1] 10395
First, average the number of steps taken to the five minute interval level
## one row per five-minute slot (minutes since midnight plus its "HH:MM" label)
## holding the mean step count across all days, ignoring missing readings
interval <- activity %>%
  group_by(minutes, time) %>%
  summarise(avgSteps = mean(steps, na.rm = TRUE)) %>%
  ## summarise() only removes the last grouping level; drop the rest so later
  ## verbs operate on the whole table instead of per `minutes` group
  ungroup() %>%
  arrange(minutes, time)
Next, create a time series plot of the average steps taken by time interval
## line chart of average steps against minutes since midnight;
## x-axis ticks every two hours plus the final 23:55 slot, shown as HH:MM
ggplot(data = interval,
       aes(x = minutes,
           y = avgSteps)) +
  geom_line(col = "black") +
  xlab("Time of Day (hh:mm)") +
  ylab("Average Number of Steps") +
  scale_x_continuous(
    limits = c(0, 1435),
    breaks = c(seq(0, 1400, 120), 1435),
    ## turn minutes since midnight into a zero-padded "HH:MM" label
    labels = function(m) sprintf("%02d:%02d", m %/% 60, m %% 60)
  ) +
  scale_y_continuous(
    limits = c(0, 225),
    breaks = seq(0, 225, 25),
    labels = comma
  )
Finally, report the time interval with the highest average number of steps
## identify observation
## as.data.frame() strips any grouping carried by `interval`, so the row is
## chosen from the whole table; which.max() gives the position of the first
## maximum (NA averages are ignored), so ties resolve to the earliest row
t <- interval %>%
  as.data.frame() %>%
  slice(which.max(avgSteps)) %>%
  select(minutes, time, avgSteps)
## print using xtable
suppressMessages(library(xtable))
print(xtable(t),
      type = "html",
      include.rownames = FALSE)
| minutes | time | avgSteps |
|---|---|---|
| 515.00 | 08:35 | 206.17 |
First, report the number of rows in the original data with missing values:
- The count is 2,304
sum(is.na(activity[["steps"]]))
## [1] 2304
Then, using the average number of steps per interval, create a new dataset, filling in the missing values
## attach each slot's average, keep observed step counts where present and
## fall back to the slot average where the reading is missing, then restore
## the original column set
act <- activity %>%
  inner_join(interval, by = c("minutes", "time")) %>%
  mutate(steps = ifelse(!is.na(steps), steps, avgSteps)) %>%
  select(steps, date, interval, datetime, minutes, time)
Next, sum the number of steps taken to the daily level
## one row per date holding the total of the (imputed) step counts
day <- act %>%
  group_by(date) %>%
  summarize(numSteps = sum(steps)) %>%
  arrange(date)
Now, create a new histogram of the total number of steps taken each day
## histogram
## The visible range is set with coord_cartesian() / expand_limits() instead of
## `limits =` on the scales: limits on a position scale discard anything that
## falls outside them, so a bin taller than the y cap (likely here, because
## imputed days share the same total and pile into one bin) or a bar reaching
## past the x range would silently vanish. Zooming keeps every bar.
ggplot(data = day,
       aes(x = numSteps)) +
  geom_histogram(col = "black",
                 fill = "steelblue",
                 binwidth = 1000) +
  xlab("Total Number of Steps Taken Each Day") +
  ylab("Number of Days") +
  scale_x_continuous(breaks = seq(0, 25000, 5000),
                     labels = comma) +
  ## unit-spaced breaks that follow the actual axis range
  scale_y_continuous(breaks = function(lims) seq(0, floor(lims[2]), by = 1),
                     labels = comma) +
  ## keep the y axis spanning at least 0-10, but let it grow if needed
  expand_limits(y = c(0, 10)) +
  coord_cartesian(xlim = c(0, 25000))
Finally, report the new mean and median number of steps taken per day:
- The mean is 10,766
- The median is 10,766
with(day, mean(numSteps))
## [1] 10766.19
with(day, median(numSteps))
## [1] 10766.19
These new mean and median values do differ from the originals. After imputing, the daily totals become less right skewed and more normally distributed. The average increased by 1,412 steps and the median increased by 371 steps.
Using the data with imputed values, create a factor to distinguish weekdays from weekend days
## create new data frame
## "%u" formats a date as its weekday number, 1 (Monday) to 7 (Sunday);
## 1-5 are labelled "weekday" and everything else "weekend"
act2 <- act %>%
  mutate(
    dayType = as.factor(
      ifelse(format(date, "%u") %in% as.character(1:5),
             "weekday",
             "weekend")
    )
  )
## quickly see counts by day type
summary(act2$dayType)
## weekday weekend
## 12960 4608
Then, average the number of steps taken to the day type / five minute interval level
## one row per day type and five-minute slot holding the mean step count
int <- act2 %>%
  group_by(dayType, minutes, time) %>%
  summarise(avgSteps = mean(steps, na.rm = TRUE)) %>%
  ## summarise() only removes the last grouping level; drop the rest so the
  ## result is a plain ungrouped table
  ungroup() %>%
  arrange(dayType, minutes, time)
Next, create a panel (faceted by day type) time series plot of the average steps taken by time interval
## line chart of average steps against minutes since midnight, one stacked
## panel per day type; x-axis ticks every two hours plus the final 23:55 slot
ggplot(data = int,
       aes(x = minutes,
           y = avgSteps)) +
  geom_line(col = "black") +
  xlab("Time of Day (hh:mm)") +
  ylab("Average Number of Steps") +
  scale_x_continuous(
    limits = c(0, 1435),
    breaks = c(seq(0, 1400, 120), 1435),
    ## turn minutes since midnight into a zero-padded "HH:MM" label
    labels = function(m) sprintf("%02d:%02d", m %/% 60, m %% 60)
  ) +
  scale_y_continuous(
    limits = c(0, 250),
    breaks = seq(0, 250, 50),
    labels = comma
  ) +
  facet_wrap(~ dayType, ncol = 1)