# Fetch the activity-monitoring archive, extract it and load the CSV into `df`.
url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2Factivity.zip"
# Skip the network round-trip when the archive is already on disk, so the
# analysis can be re-run offline.
if (!file.exists("activity.zip")) {
  # mode = "wb" requests a binary transfer so the zip is not altered.
  download.file(url, "activity.zip", mode = "wb")
}
unzip("activity.zip")
df <- read.csv("activity.csv")
# Inspect the column types and the first rows.
str(df)
## 'data.frame': 17568 obs. of 3 variables:
## $ steps : int NA NA NA NA NA NA NA NA NA NA ...
## $ date : chr "2012-10-01" "2012-10-01" "2012-10-01" "2012-10-01" ...
## $ interval: int 0 5 10 15 20 25 30 35 40 45 ...
head(df)
## steps date interval
## 1 NA 2012-10-01 0
## 2 NA 2012-10-01 5
## 3 NA 2012-10-01 10
## 4 NA 2012-10-01 15
## 5 NA 2012-10-01 20
## 6 NA 2012-10-01 25
On most days the total number of steps is between 10000 and 15000. The mean daily total is 10766 steps and the median is 10765.
# Total steps per day. sum() is called without na.rm, so a day that contains
# any NA interval gets an NA total.
df_date <- df %>%
  group_by(date) %>%
  summarize(total_steps = sum(steps))
head(df_date)
## # A tibble: 6 × 2
## date total_steps
## <chr> <int>
## 1 2012-10-01 NA
## 2 2012-10-02 126
## 3 2012-10-03 11352
## 4 2012-10-04 12116
## 5 2012-10-05 13294
## 6 2012-10-06 15420
# Distribution of the daily totals.
hist(df_date$total_steps)
# summary() excludes NAs from the quantiles and reports their count itself;
# it takes no na.rm argument, so none is passed.
summary(df_date$total_steps)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 41 8841 10765 10766 13294 21194 8
Find out which 5-minute interval contains the maximum number of steps on average: interval 835, with an average of about 206 steps.
# Mean number of steps for each 5-minute interval across all days; NA
# readings are left out of the mean.
avg_df <- df %>%
  group_by(interval) %>%
  summarise(avg_steps = mean(steps, na.rm = TRUE))
# Row for the interval with the highest mean.
avg_df[which.max(avg_df[["avg_steps"]]), ]
## # A tibble: 1 × 2
## interval avg_steps
## <int> <dbl>
## 1 835 206.
# Daily activity pattern: blue points joined by a red line. Both layers
# inherit the data and aesthetics declared in ggplot().
ggplot(avg_df, aes(x = interval, y = avg_steps)) +
  geom_point(colour = "blue") +
  geom_line(colour = "red")
Replace the NAs with the average number of steps per interval and draw a new histogram from the imputed dataset.
# Impute missing step counts with the overall mean of the observed values.
# The mean is computed from the data rather than hard-coded, and is assigned
# as a number so `steps` stays numeric and needs no later conversion.
df_new <- df
df_new$steps[is.na(df_new$steps)] <- mean(df$steps, na.rm = TRUE)
# Confirm that no missing values remain.
sum(is.na(df_new))
## [1] 0
# Daily totals after imputation, and their distribution.
df_new_by_date <- df_new %>%
  group_by(date) %>%
  summarize(total_steps = sum(steps))
hist(df_new_by_date$total_steps)
Add another field that marks each date as a weekday or a weekend day, and plot the two groups in separate facets to compare their patterns.
# Parse the date strings so weekday names can be derived from them.
df_new$date <- as.Date(df_new$date)
# Monday-Friday names as weekdays() spells them in the current locale, taken
# from a known Monday (2012-10-01) and the four days after it. Generating the
# names avoids misspelt literals and keeps the match working in non-English
# locales.
weekdays1 <- weekdays(as.Date("2012-10-01") + 0:4)
# TRUE -> "weekday", FALSE -> "weekend".
df_new$wDay <- factor(
  weekdays(df_new$date) %in% weekdays1,
  levels = c(FALSE, TRUE),
  labels = c("weekend", "weekday")
)
# Alternative: timeDate::isWeekday(df_new$date, wday = 1:5)
# Steps per interval, one facet for weekends and one for weekdays.
ggplot(data = df_new, aes(x = interval, y = steps)) +
  geom_point() +
  facet_wrap(~wDay)
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.