library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.1
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Read the activity measurements from the working directory and preview
# the first few records.
raw <- read.csv(file = "activity.csv")
head(raw)
## steps date interval
## 1 NA 2012-10-01 0
## 2 NA 2012-10-01 5
## 3 NA 2012-10-01 10
## 4 NA 2012-10-01 15
## 5 NA 2012-10-01 20
## 6 NA 2012-10-01 25
For this part of the assignment, we ignored the missing values in the dataset.
# Total number of steps per calendar day. NA readings are dropped before
# summing, so a day whose readings are all NA gets a total of 0.
by_date <- raw %>% group_by(date)
ave_step <- by_date %>%
  summarise(dailySum = sum(steps, na.rm = TRUE))
head(ave_step)
## # A tibble: 6 x 2
## date dailySum
## <fct> <int>
## 1 2012-10-01 0
## 2 2012-10-02 126
## 3 2012-10-03 11352
## 4 2012-10-04 12116
## 5 2012-10-05 13294
## 6 2012-10-06 15420
# Histogram of the total number of steps per day.
# The fill colour is set as a constant on the layer: passing fill = "red" to
# qplot() maps it as an aesthetic (legend entry, default palette colour)
# instead of painting the bars red. qplot() is also deprecated in current
# ggplot2, so the plot is built with ggplot() like the other figures here.
ggplot(data = ave_step, aes(x = dailySum)) +
  geom_histogram(fill = "red") +
  xlab("Number of Daily Steps") +
  ylab("Number of Days")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Mean and median of the daily step totals, shown as a small labelled
# character matrix (the numbers are coerced to strings next to the labels).
stepData <- ave_step$dailySum
MEAN <- mean(stepData, na.rm = TRUE)
MEDIAN <- median(stepData, na.rm = TRUE)
matrix(
  c("Mean of Daily Steps", MEAN,
    "Median of Daily Steps", MEDIAN),
  nrow = 2,
  byrow = TRUE
)
## [,1] [,2]
## [1,] "Mean of Daily Steps" "9354.22950819672"
## [2,] "Median of Daily Steps" "10395"
# Average number of steps for each 5-minute interval across all days
# (columns 1 and 3 are steps and interval; NA readings are dropped).
by_interval <- raw[, c(1, 3)] %>% group_by(interval)
ave_stepByInterval <- by_interval %>%
  summarise(intervalMean = mean(steps, na.rm = TRUE))

# Time-series view of the average daily activity pattern.
ggplot(data = ave_stepByInterval, aes(x = interval, y = intervalMean)) +
  geom_line() +
  labs(x = "5-minute interval", y = "average number of steps taken")

# The interval with the highest average step count.
max_step_interval <- ave_stepByInterval %>%
  slice(which.max(intervalMean))
max_step_interval
## # A tibble: 1 x 2
## interval intervalMean
## <int> <dbl>
## 1 835 206.
Note that there are a number of days/intervals where there are missing values (coded as NA). The presence of missing days may introduce bias into some calculations or summaries of the data.
First of all, using is.na, we can confirm that there are no missing values in the date and interval columns of the raw dataset:
# Combined count of missing entries in the date and interval columns.
sum(is.na(raw$date), is.na(raw$interval))
## [1] 0
All missing values are therefore in the steps column, so the count below is also the total number of missing values in the dataset (i.e. the total number of rows with NAs).
# Number of rows whose step count is missing.
sum(is.na(raw[["steps"]]))
## [1] 2304
First, we calculate the mean number of steps per interval for each day. For days on which no data were recorded at all, this mean is undefined (NaN), so we set it to 0.
# Mean steps per interval for every day (columns 1:2 are steps and date).
# A day with only NA readings yields NaN from mean(); those are set to 0.
by_Date <- raw[, 1:2] %>%
  group_by(date) %>%
  summarise(Intervalmean = mean(steps, na.rm = TRUE))
by_Date$Intervalmean[is.nan(by_Date$Intervalmean)] <- 0
by_Date
## # A tibble: 61 x 2
## date Intervalmean
## <fct> <dbl>
## 1 2012-10-01 0
## 2 2012-10-02 0.438
## 3 2012-10-03 39.4
## 4 2012-10-04 42.1
## 5 2012-10-05 46.2
## 6 2012-10-06 53.5
## 7 2012-10-07 38.2
## 8 2012-10-08 0
## 9 2012-10-09 44.5
## 10 2012-10-10 34.4
## # ... with 51 more rows
Then, we clone the raw dataset to NewRaw, which will store the cleaned dataset (without NAs). After that, we check each cell in the steps column for NA. If it is NA, we extract the date of that cell and replace the NA value with the average number of steps per interval on that day.
See that the NA values of the first 6 rows are replaced with 0
# Build the imputed data set: every missing step count is replaced by the
# mean steps-per-interval of the same day (taken from by_Date).
NewRaw <- raw

# Rows that need a value.
missing_rows <- which(is.na(raw$steps))

# Look each day up in by_Date by its date *value*. Indexing by_Date with the
# raw date relied on factor codes happening to equal row positions and does
# not work when the date column is read as character.
day_index <- match(
  as.character(raw$date[missing_rows]),
  as.character(by_Date$date)
)

# Vectorised replacement; the steps column becomes numeric because the daily
# means are not whole numbers.
NewRaw$steps[missing_rows] <- by_Date$Intervalmean[day_index]
head(NewRaw)
## steps date interval
## 1 0 2012-10-01 0
## 2 0 2012-10-01 5
## 3 0 2012-10-01 10
## 4 0 2012-10-01 15
## 5 0 2012-10-01 20
## 6 0 2012-10-01 25
Do these values differ from the estimates from the first part of the assignment? What is the impact of imputing missing data on the estimates of the total daily number of steps?
# Daily step totals recomputed on the imputed data set. The missing step
# counts were filled in above, so sum() is called without na.rm.
by_date2 <- group_by(NewRaw, date)
ave_step2 <- summarise(by_date2, dailySum = sum(steps))

# Histogram of the imputed daily totals in 1000-step bins. Built with
# ggplot() because qplot() is deprecated in current ggplot2; the y-axis
# label typo ("Numer") is corrected.
ggplot(data = ave_step2, aes(x = dailySum)) +
  geom_histogram(binwidth = 1000) +
  xlab("Number of Daily Steps") +
  ylab("Number of Days")
Calculate the mean and median total number of steps taken per day.
# Mean and median of the daily totals after imputation, displayed as a
# labelled character matrix.
stepData2 <- ave_step2$dailySum
MEAN2 <- mean(stepData2)
MEDIAN2 <- median(stepData2)
matrix(
  c("Mean of Daily Steps", MEAN2,
    "Median of Daily Steps", MEDIAN2),
  nrow = 2,
  byrow = TRUE
)
## [,1] [,2]
## [1,] "Mean of Daily Steps" "9354.22950819672"
## [2,] "Median of Daily Steps" "10395"
First, we add a column WeekDay to the imputed dataset and store the result in rawNwkd. Then we subset the weekday rows from rawNwkd, group them by interval, and calculate the mean number of steps for each interval across all weekdays. Finally, we add an indicator column WD (1 for weekdays). We then do the same for the weekend rows (WD = 0), combine the two datasets, and plot the result.
# Compare the average activity pattern on weekdays and weekends.

# Keep the weekday name as an informational column. weekdays() returns
# locale-dependent names, so it is NOT used to classify the rows below.
rawNwkd <- mutate(NewRaw, WeekDay = weekdays(as.Date(date)))

# Locale-independent classification: POSIXlt$wday is 0 for Sunday through
# 6 for Saturday, so 0 and 6 mark the weekend.
is_weekend <- as.POSIXlt(as.Date(rawNwkd$date))$wday %in% c(0, 6)

# Weekdays: mean steps per interval, flagged with WD = 1.
WEEKDAY <- rawNwkd[!is_weekend, ]
WEEKDAY2 <- group_by(WEEKDAY[, c(1, 3)], interval)
WEEKDAY3 <- summarise(WEEKDAY2, ave = mean(steps))
WEEKDAY3 <- mutate(WEEKDAY3, WD = 1)

# Weekends: mean steps per interval, flagged with WD = 0.
WEEKEND <- rawNwkd[is_weekend, ]
WEEKEND2 <- group_by(WEEKEND[, c(1, 3)], interval)
WEEKEND3 <- summarise(WEEKEND2, ave = mean(steps))
WEEKEND3 <- mutate(WEEKEND3, WD = 0)

# Stack both summaries and draw one panel per day type; the labeller turns
# the 0/1 indicator into readable panel titles.
combined <- rbind(WEEKDAY3, WEEKEND3)
ggplot(data = combined, aes(x = interval, y = ave)) +
  geom_line(colour = "#FF9999") +
  facet_grid(
    rows = vars(WD),
    labeller = as_labeller(c("0" = "weekend", "1" = "weekday"))
  )