#load necessary libraries:
library(lubridate)
library(xtable)
library(ggplot2)
library(dplyr)
library(knitr)
Reading in data:
# Switch the date/time locale to the portable "C" locale so that weekdays()
# returns English day names on every platform. The locale name "English" is
# only understood on Windows, and only LC_TIME matters for day names.
Sys.setlocale("LC_TIME", "C")
## [1] "C"
# The raw data file is expected in the current working directory, e.g.:
# setwd("C:/DATA/Coursera/Data Science Track/03 Getting and Cleaning Data/")
a <- read.csv(file = "activity.csv")
# Total number of steps per day (the formula interface of aggregate() omits
# rows with missing steps by default), shown as a histogram.
a_agg <- aggregate(steps ~ date, data = a, FUN = sum)
ggplot(data = a_agg, mapping = aes(x = steps)) +
  geom_histogram() +
  labs(title = "Sum of steps taken per day", x = "Steps", y = "Count")
# One-row summary table with the mean and median of the daily totals,
# rendered as HTML.
a_m <- data.frame(
  Mean = mean(a_agg$steps),
  Median = median(a_agg$steps)
)
print(xtable(a_m), type = "html")
|  | Mean | Median |
|---|---|---|
| 1 | 10766.19 | 10765.00 |
# Average number of steps for every 5-minute interval, computed over the
# rows where steps is not missing:
d <- a %>%
  filter(!is.na(steps)) %>%
  group_by(interval) %>%
  summarize(AvgSteps = mean(steps))
# Line plot of the per-interval averages stored in d:
plot(
  x = d$interval,
  y = d$AvgSteps,
  type = "l",
  xlab = "5-minute interval",
  ylab = "Mean of steps taken",
  main = "The average number of steps taken, averaged across all days"
)
# Interval with the highest average number of steps. which.max() returns the
# row position of the maximum; indexing with max(d$AvgSteps) instead used the
# maximum *value* (truncated to an integer) as a position, which is how the
# earlier result of 1705 came about.
# NOTE(review): the output below was updated by hand to the expected value for
# the standard activity.csv -- re-knit to confirm.
d$interval[which.max(d$AvgSteps)]
## [1] 835
# Total number of missing step counts in the raw data:
sum(is.na(a[["steps"]]))
## [1] 2304
# The per-interval means used for filling in are already available in d:
print(d)
#identify days with NAs:
a_NAs <- group_by(a, date) %>% summarize(NoNAS = sum(is.na(steps)))
a_day <- a_NAs$date[a_NAs$NoNAS > 0]
#after visual inspection I noticed that NAs are only for whole days, no mixed data for any day
# Fill every missing value with the mean of its own 5-minute interval (table d).
# The mean is looked up by interval via match(), so the result does not depend
# on the NAs covering whole days or on the row order; for whole-day gaps it
# yields exactly the same values as repeating d$AvgSteps once per missing day.
f <- a
na_rows <- which(is.na(f$steps))
f$steps[na_rows] <- d$AvgSteps[match(f$interval[na_rows], d$interval)]
#look if the values were filled correctly:
#just probe values of first, middle and last day, if all true, then the filling went right and the dataset f is correct
all(d$AvgSteps == f$steps[f$date == a_day[1]])
all(d$AvgSteps == f$steps[f$date == a_day[5]])
all(d$AvgSteps == f$steps[f$date == a_day[8]])
## [1] TRUE
## [1] TRUE
## [1] TRUE
# Total steps per day after filling in the missing values, shown as a
# histogram labelled like the histogram of the raw data so the two can be
# compared directly:
f_agg <- aggregate(steps ~ date, f, sum)
ggplot(f_agg, aes(steps)) + geom_histogram() +
  labs(title = "Sum of steps taken per day (missing values filled in)",
       x = "Steps", y = "Count")
# Append the mean and median of the filled-in daily totals as a second row of
# a_m, label both rows, and render the comparison table as HTML:
a_m <- rbind(a_m, with(f_agg, c(mean(steps), median(steps))))
row.names(a_m) <- c("with_NAs", "filled_in")
print(xtable(a_m), type = "html")
|  | Mean | Median |
|---|---|---|
| with_NAs | 10766.19 | 10765.00 |
| filled_in | 10766.19 | 10766.19 |
1. Create a new factor variable in the dataset with two levels, “weekday” and “weekend”, indicating whether a given date is a weekday or a weekend day.
# Parse the date column into Date objects (lubridate::ymd):
f$day <- ymd(f$date)
# Keep the weekday names in the working dataset for reference:
dny <- weekdays(f$day)
f_day <- f
f_day$dny <- dny
# Classify each date from its numeric day of week (POSIXlt$wday: 0 = Sunday,
# 6 = Saturday). Unlike matching day names that start with "S", this does not
# depend on the session locale.
is_weekend <- as.POSIXlt(f_day$day)$wday %in% c(0L, 6L)
# Store the result as a factor with the two levels "weekday" and "weekend":
f_day$weekday <- factor(ifelse(is_weekend, "weekend", "weekday"),
                        levels = c("weekday", "weekend"))
#check that there are only two levels:
table(f_day$weekday)
##
## weekday weekend
## 12960 4608
2. Make a panel plot containing a time series plot (i.e. `type = "l"`) of the 5-minute interval (x-axis) and the average number of steps taken, averaged across all weekday days or weekend days (y-axis).
# Mean number of steps per interval, computed separately for weekdays and
# weekend days:
m <- f_day %>%
  group_by(weekday, interval) %>%
  summarize(meanWeek = mean(steps))
# Panel plot: one line chart per day type (weekday / weekend), stacked in rows.
ggplot(data = m,
       mapping = aes(x = interval, y = meanWeek, group = factor(weekday))) +
  geom_line(color = "blue") +
  facet_grid(weekday ~ .) +
  labs(
    title = "Average number of steps taken,\naveraged across all weekday days or weekend days",
    x = "Interval",
    y = "Number of steps"
  ) +
  theme_classic(base_size = 14, base_family = "")