# Load the activity dataset, downloading and unpacking the archive first when
# "activity.csv" is not already in the working directory.
# NOTE: the former rm(list = ls()) was dropped so that sourcing this script no
# longer wipes the caller's workspace.
if (!file.exists("activity.csv")) {
  # Use a dedicated name so the tempfile() function is not shadowed.
  zip_path <- tempfile(fileext = ".zip")
  download.file(
    "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2Factivity.zip",
    destfile = zip_path, # download.file() requires a destination file
    mode = "wb"          # binary mode keeps the zip archive intact on Windows
  )
  unzip(zip_path)
  unlink(zip_path)
}
activity <- read.csv("activity.csv")
We use the aggregate function and we remove NAs as below:
# Total number of steps for each date.
activity_steps_day <- aggregate(
  steps ~ date,
  data = activity,
  FUN = sum,
  na.rm = TRUE
)

# Distribution of the daily totals.
hist(
  activity_steps_day$steps,
  xlab = "Number of steps per day",
  main = "Total number of steps per day",
  col = "grey"
)

# Mean and median of the daily totals, stored as display strings.
mean_steps <- format(mean(activity_steps_day$steps), digits = 1)
median_steps <- format(median(activity_steps_day$steps), digits = 1)
We use the aggregate function and we remove NAs as below:
# Mean number of steps for each 5-minute interval, averaged over all days.
activity_average_daily <- aggregate(
  steps ~ interval,
  data = activity,
  FUN = mean,
  na.rm = TRUE
)
We get the plot as below:
# Time series of the average number of steps per interval.
# The y-axis label now says "Average": the plotted values are per-interval
# means, not totals.
plot(
  activity_average_daily$interval,
  activity_average_daily$steps,
  type = "l",
  col = "grey",
  xlab = "Intervals",
  ylab = "Average number of steps per interval",
  main = "Average number of steps taken, averaged across all days"
)

# Interval(s) at which the average number of steps peaks.
maximum_steps <- max(activity_average_daily$steps)
peak_rows <- which(activity_average_daily$steps == maximum_steps)
maximum_interval <- activity_average_daily$interval[peak_rows]
# Keep the peak value as a display string.
maximum_steps <- format(maximum_steps, digits = 1)
# Number of missing cells in the whole data frame (all columns).
mv <- length(which(is.na(activity)))
We plot the number of missing values per interval and per day to understand which method we should consider.
# Rows whose step count is missing.
missing_values <- activity[is.na(activity$steps), ]

# Position of each missing row's date among the sorted distinct dates.
# as.numeric() on the date column only works when it is a factor; on character
# dates (the read.csv() default since R 4.0) it yields NA and hist() fails.
all_days <- sort(unique(as.character(activity$date)))
missing_day_index <- match(as.character(missing_values$date), all_days)

# Two stacked panels; the previous graphical settings are restored afterwards
# so later plots are not drawn with the reduced margins.
old_par <- par(mfrow = c(2, 1), mar = c(2, 2, 1, 1))
hist(missing_values$interval, main = "NAs repartition per interval", col = "grey")
hist(missing_day_index, main = "NAs repartition per day", breaks = 80, col = "grey")
par(old_par)
The NAs are uniformly distributed across the intervals, but they occur on only 8 days. Hence, each missing value should be replaced by the mean of its interval across all the days in the dataset.
The method is therefore as follows:

* The average number of steps per interval, across all the days, is calculated.
* The dataset activity is split into two datasets: activity_with_NAs and activity_without_NAs.
* Each missing value in activity_with_NAs is replaced by the mean of its interval, as described above.
* Both datasets are merged into a new dataset called new_activity.
# Mean of steps per interval across all days; the result is named by interval.
mean_steps_interval <- tapply(activity$steps, activity$interval, mean, na.rm = TRUE)

# Split the data into rows with and without a missing step count.
steps_missing <- is.na(activity$steps)
activity_with_NAs <- activity[steps_missing, ]
activity_without_NAs <- activity[!steps_missing, ]

# Replace each missing value by the rounded mean of its own interval.
# The mean is looked up by interval name, so this also works when the missing
# rows do not cover every interval (the former factor-relevelling approach
# required the two sets of intervals to match exactly).
interval_keys <- as.character(activity_with_NAs$interval)
activity_with_NAs$steps <- as.integer(round(mean_steps_interval[interval_keys]))

# Merge the two datasets (imputed rows first, as before).
new_activity <- rbind(activity_with_NAs, activity_without_NAs)
First, we want the total number of steps taken each day. Two datasets will be compared: one with NAs and one with the NAs filled in.
# Side-by-side histograms of daily totals: original data vs. imputed data.
par(mfrow = c(1, 2))

# Daily totals from the original data.
activity_steps_day <- aggregate(
  steps ~ date,
  data = activity,
  FUN = sum,
  na.rm = TRUE
)
hist(
  activity_steps_day$steps,
  xlab = "Number of steps per day",
  main = "Steps / Day (without NAs)",
  col = "grey"
)

# Daily totals from the data with missing values filled in.
new_activity_steps_day <- aggregate(
  steps ~ date,
  data = new_activity,
  FUN = sum,
  na.rm = TRUE
)
hist(
  new_activity_steps_day$steps,
  xlab = "Number of steps per day",
  main = "Steps / Day (NAs filled)",
  col = "green"
)
Then, we calculate the new mean and median values and we store them with the previous results in a table with the package xtable.
# Mean and median of the daily totals after imputation, as display strings.
new_mean_steps <- format(mean(new_activity_steps_day$steps), digits = 1)
new_median_steps <- format(median(new_activity_steps_day$steps), digits = 1)

# Compare the statistics computed before and after filling the missing values.
results <- data.frame(
  c(mean_steps, median_steps),
  c(new_mean_steps, new_median_steps)
)
colnames(results) <- c("without NAs", "NAs filled")
rownames(results) <- c("mean", "median")

# xtable is namespace-qualified because this file never attaches the package;
# loading the namespace also registers the print method used below.
table_results <- xtable::xtable(results)
print(table_results, type = "html")
## <!-- html table generated in R 3.3.0 by xtable 1.8-3 package -->
## <!-- Sat Apr 20 21:50:07 2019 -->
## <table border=1>
## <tr> <th> </th> <th> without NAs </th> <th> NAs filled </th> </tr>
## <tr> <td align="right"> mean </td> <td> 10766 </td> <td> 10766 </td> </tr>
## <tr> <td align="right"> median </td> <td> 10765 </td> <td> 10762 </td> </tr>
## </table>
Finally, filling in the missing values does not change the mean, and the median is reduced by only about 0.03% (from 10765 to 10762). Besides, both histograms have the same shape.
# Classify each date as "weekend" (Saturday or Sunday) or "weekday".
# The numeric day of the week from POSIXlt (0 = Sunday, ..., 6 = Saturday) is
# used instead of weekdays(), whose names depend on the session locale.
day_of_week <- as.POSIXlt(as.Date(new_activity$date))$wday
new_activity$weektype <- ifelse(day_of_week == 6 | day_of_week == 0, "weekend", "weekday")
# Store the day type as a factor.
new_activity$weektype <- factor(new_activity$weektype)
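Note that weekdays() returns day names in the language of the R session's locale: “samedi” and “dimanche” in my French locale, “Saturday” and “Sunday” in an English one. Any comparison against day names must therefore use the names for your own locale.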
# Average number of steps per interval, separately for weekdays and weekends.
activity_steps_interval_weektype <- aggregate(
  steps ~ interval + weektype,
  data = new_activity,
  FUN = mean
)

# One panel per day type, stacked vertically.
# ggplot2 functions are namespace-qualified because this file never attaches
# the package. The former facet_grid() call was dropped: the facet_wrap() that
# followed it replaced it anyway.
plot <- ggplot2::ggplot(
  activity_steps_interval_weektype,
  ggplot2::aes(interval, steps, color = weektype)
) +
  ggplot2::geom_line() +
  ggplot2::facet_wrap(~weektype, ncol = 1, nrow = 2) +
  ggplot2::labs(x = "Intervals", y = "Average number of steps", title = "Activity")
print(plot)
Overall, it seems that people start their activity earlier on weekdays, with a higher peak in the morning. During weekends, people are more active throughout the day than on weekdays.