library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Questions 1-3: load the activity data, total the steps for each day, then
# plot and summarise the distribution of those daily totals.
a <- read.csv("activity.csv")
# No format string is given: the recorded results below were produced by
# as.Date()'s default parsing. A format must be passed to as.Date() itself;
# as.character() silently ignores such an argument.
a$date <- as.Date(as.character(a$date))
a$day <- day(a$date)
a$month <- month(a$date)
# One row per (day, month) pair. aggregate() keeps the formula terms as column
# names, so `b` has the columns "a$day", "a$month" and "a$steps" (the total).
b <- aggregate(a$steps ~ a$day + a$month, data = a, FUN = sum, na.rm = TRUE)
# Open a 960x960 PNG device before plotting so the whole plot fits.
png(filename = "question2.png", width = 960, height = 960)
# Fixed 5000-step bins from 0 to 25000; hist() stops with an error if a daily
# total falls outside that range.
hist(
  b[["a$steps"]],
  breaks = seq(0, 25000, by = 5000),
  xlab = "steps taken per day",
  ylab = "frequency",
  main = "frequency of steps taken every day"
)
# Draw an unlabelled axis line from 0 to the largest total (no tick marks) ...
axis(1, at = c(0, max(b[["a$steps"]])), labels = c("", ""), lwd.ticks = 0)
# ... then add tick marks every 5000 steps without drawing another line.
axis(1, at = seq(0, max(b[["a$steps"]]), by = 5000), lwd = 0, lwd.ticks = 1)
dev.off()
## png
## 2
library(knitr)
include_graphics("question2.png")
summary(b)
## a$day a$month a$steps
## Min. : 2.00 Min. :10.00 Min. : 41
## 1st Qu.:10.00 1st Qu.:10.00 1st Qu.: 8841
## Median :17.00 Median :10.00 Median :10765
## Mean :16.68 Mean :10.45 Mean :10766
## 3rd Qu.:24.00 3rd Qu.:11.00 3rd Qu.:13294
## Max. :31.00 Max. :11.00 Max. :21194
# Questions 4-5: mean number of steps in each interval and the interval where
# that mean is largest.
# aggregate() keeps the formula terms as column names, so `d` has the columns
# "a$interval" and "a$steps" (the mean of steps within each interval).
d <- aggregate(a$steps ~ a$interval, data = a, FUN = mean)
# A 960x960 PNG device is opened first so that the whole plot fits.
png(filename = "question4.png", width = 960, height = 960)
plot(
  x = d[["a$interval"]],
  y = d[["a$steps"]],
  type = "l",
  xlab = "interval",
  ylab = "steps",
  main = "time series of average steps"
)
dev.off()
## png
## 2
library(knitr)
include_graphics("question4.png")
# Interval(s) whose mean step count equals the overall maximum.
with(d, `a$interval`[`a$steps` == max(`a$steps`)])
## [1] 835
# Show the first 20 rows of `a`.
head(a, n = 20)
## steps date interval day month
## 1 NA 2012-10-01 0 1 10
## 2 NA 2012-10-01 5 1 10
## 3 NA 2012-10-01 10 1 10
## 4 NA 2012-10-01 15 1 10
## 5 NA 2012-10-01 20 1 10
## 6 NA 2012-10-01 25 1 10
## 7 NA 2012-10-01 30 1 10
## 8 NA 2012-10-01 35 1 10
## 9 NA 2012-10-01 40 1 10
## 10 NA 2012-10-01 45 1 10
## 11 NA 2012-10-01 50 1 10
## 12 NA 2012-10-01 55 1 10
## 13 NA 2012-10-01 100 1 10
## 14 NA 2012-10-01 105 1 10
## 15 NA 2012-10-01 110 1 10
## 16 NA 2012-10-01 115 1 10
## 17 NA 2012-10-01 120 1 10
## 18 NA 2012-10-01 125 1 10
## 19 NA 2012-10-01 130 1 10
## 20 NA 2012-10-01 135 1 10
# As you can see, the "steps" values in these first 20 rows are all NA
# Impute missing step counts: every NA in `steps` is replaced by the mean of
# the observed (non-NA) step counts recorded for the same interval. The
# original data frame `a` is left untouched; the imputed copy is `f`.
f <- a
missing_steps <- is.na(f$steps)
# ave() returns a vector as long as `steps` in which each position holds the
# mean of its own interval group, computed from the observed values only.
interval_means <- ave(
  f$steps,
  f$interval,
  FUN = function(x) mean(x, na.rm = TRUE)
)
# Overwrite only the missing entries; observed counts are kept as they are.
f$steps[missing_steps] <- interval_means[missing_steps]
head(f, 20)
## steps date interval day month
## 1 1.7169811 2012-10-01 0 1 10
## 2 0.3396226 2012-10-01 5 1 10
## 3 0.1320755 2012-10-01 10 1 10
## 4 0.1509434 2012-10-01 15 1 10
## 5 0.0754717 2012-10-01 20 1 10
## 6 2.0943396 2012-10-01 25 1 10
## 7 0.5283019 2012-10-01 30 1 10
## 8 0.8679245 2012-10-01 35 1 10
## 9 0.0000000 2012-10-01 40 1 10
## 10 1.4716981 2012-10-01 45 1 10
## 11 0.3018868 2012-10-01 50 1 10
## 12 0.1320755 2012-10-01 55 1 10
## 13 0.3207547 2012-10-01 100 1 10
## 14 0.6792453 2012-10-01 105 1 10
## 15 0.1509434 2012-10-01 110 1 10
## 16 0.3396226 2012-10-01 115 1 10
## 17 0.0000000 2012-10-01 120 1 10
## 18 1.1132075 2012-10-01 125 1 10
## 19 1.8301887 2012-10-01 130 1 10
## 20 0.1698113 2012-10-01 135 1 10
# Now you can see that the missing "steps" values have been replaced by the mean for the same interval
# Question 7: histogram of the total steps per day after imputation.
# aggregate() keeps the formula terms as column names, so `g` has the columns
# "f$day", "f$month" and "f$steps" (the total for that day/month pair).
g <- aggregate(
  f$steps ~ f$day + f$month,
  data = f,
  FUN = sum,
  na.rm = TRUE
)
# A 960x960 PNG device is opened first so that the whole plot fits.
png(filename = "question7.png", width = 960, height = 960)
hist(
  g[["f$steps"]],
  breaks = (0:5) * 5000,
  xlab = "steps taken per day",
  ylab = "frequency",
  main = "frequency of steps taken every day"
)
# Unlabelled axis line spanning 0 to the largest daily total, without ticks.
axis(
  side = 1,
  at = c(0, max(g[["f$steps"]])),
  labels = c("", ""),
  lwd.ticks = 0
)
# Tick marks every 5000 steps, drawn without an additional axis line.
axis(
  side = 1,
  at = seq(0, max(g[["f$steps"]]), by = 5000),
  lwd = 0,
  lwd.ticks = 1
)
dev.off()
## png
## 2
library(knitr)
include_graphics("question7.png")
# As we can see, the impact of imputing the missing data is that the maximum frequency (the height of the tallest bar) has increased to 35
# Question 8: compare the average activity pattern on weekdays and weekends,
# using the data in `f`.
# wday() gives the day of the week as a number. Days numbered 1 and 7 are
# treated as the weekend, which matches lubridate's default numbering
# (1 = Sunday, 7 = Saturday). NOTE(review): confirm that the
# lubridate.week.start option has not been changed in this session.
f$day.num <- wday(f$date)
f$daytype <- ifelse(f$day.num == 1 | f$day.num == 7, "weekend", "weekday")
# Mean steps for every interval, computed separately for each day type.
h <- aggregate(f$steps ~ f$interval + f$daytype, data = f, FUN = mean)
library(ggplot2)
# Open a 960x960 PNG device before plotting so the whole figure fits.
png(filename = "question8.png", width = 960, height = 960)
# aggregate() named the columns of `h` after the formula terms ("f$interval",
# "f$daytype", "f$steps"). They are referenced as back-quoted column names that
# ggplot2 looks up in `data`; writing h$... inside aes() is discouraged by
# ggplot2 and raises warnings.
i <- ggplot(data = h, mapping = aes(x = `f$interval`, y = `f$steps`))
# print() renders the plot explicitly, so the file is also written when the
# script is run through source(), where top-level values are not auto-printed.
print(
  i + geom_line() + facet_wrap(~ `f$daytype`) +
    labs(x = "Intervals", y = "daily average of steps", title = "pattern")
)
dev.off()
## png
## 2
library(knitr)
include_graphics("question8.png")
# We notice that on weekdays the number of steps taken is higher at the beginning of the day, whereas on weekends the steps are spread more evenly throughout the day