library(dplyr)
library(ggplot2)
library(readr)
# Load the raw activity measurements from the local CSV export.
activity <- read_csv(
  file = "C:/Users/tuuye/Desktop/Data Science course/Reproducible Research/activity.csv"
)
# Rows x columns of the loaded table
dim(activity)
## [1] 17568 3
The dataset has 17568 observations and 3 variables.
# Preview the first six rows of the raw data.
head(activity, n = 6L)
## # A tibble: 6 x 3
## steps date interval
## <dbl> <date> <dbl>
## 1 NA 2012-10-01 0
## 2 NA 2012-10-01 5
## 3 NA 2012-10-01 10
## 4 NA 2012-10-01 15
## 5 NA 2012-10-01 20
## 6 NA 2012-10-01 25
# Day-of-week label for every observation (names follow the session locale).
activity$day <- weekdays(as.Date(activity$date))
# Timestamp for the start of each observation's calendar day.
# as.POSIXct() applied directly to a Date yields midnight UTC without a zone
# attribute, so a session west of UTC prints it as the previous evening
# (e.g. 2012-09-30 17:00:00 for 2012-10-01). Parsing the date text with an
# explicit tz keeps the same instant but displays it on the recorded date.
activity$DateTime <- as.POSIXct(format(as.Date(activity$date)), tz = "UTC")
head(activity)
## # A tibble: 6 x 5
## steps date interval day DateTime
## <dbl> <date> <dbl> <chr> <dttm>
## 1 NA 2012-10-01 0 Monday 2012-09-30 17:00:00
## 2 NA 2012-10-01 5 Monday 2012-09-30 17:00:00
## 3 NA 2012-10-01 10 Monday 2012-09-30 17:00:00
## 4 NA 2012-10-01 15 Monday 2012-09-30 17:00:00
## 5 NA 2012-10-01 20 Monday 2012-09-30 17:00:00
## 6 NA 2012-10-01 25 Monday 2012-09-30 17:00:00
# Total steps per calendar day. The formula method of aggregate() drops rows
# whose steps value is NA, so days without any recorded steps are absent from
# the result. The column labels are taken verbatim from the formula terms.
sum_steps <- aggregate(
  activity$steps ~ activity$date,
  FUN = sum
)
head(sum_steps, n = 6L)
## activity$date activity$steps
## 1 2012-10-02 126
## 2 2012-10-03 11352
## 3 2012-10-04 12116
## 4 2012-10-05 13294
## 5 2012-10-06 15420
## 6 2012-10-07 11015
We will rename the columns to `Date` and `Steps`.
# Swap the formula-derived column labels for short, readable names.
names(sum_steps) <- c("Date", "Steps")
head(sum_steps, n = 6L)
## Date Steps
## 1 2012-10-02 126
## 2 2012-10-03 11352
## 3 2012-10-04 12116
## 4 2012-10-05 13294
## 5 2012-10-06 15420
## 6 2012-10-07 11015
# Distribution of the daily step totals.
hist(
  sum_steps[["Steps"]],
  breaks = 5,
  main = "Histogram of the total number of steps taken each day",
  xlab = "Steps",
  col = "brown"
)
# Mean and median of the daily totals, rounded to whole steps.
MEAN <- round(mean(sum_steps[["Steps"]]))
MEDIAN <- round(median(sum_steps[["Steps"]]))
print(paste("MEAN of the total number of steps taken per day =", MEAN))
## [1] "MEAN of the total number of steps taken per day = 10766"
print(paste("MEDIAN of the total number of steps taken per day=", MEDIAN))
## [1] "MEDIAN of the total number of steps taken per day= 10765"
# NOTE(review): plyr is attached after dplyr here, so plyr's summarize() masks
# dplyr's for the rest of the script; the ddply() calls rely on that.
library(plyr)
library(ggplot2)
# Keep only rows without missing values.
clean <- na.omit(activity)
# Mean number of steps for each 5-minute interval across the remaining days.
interval_steps <- ddply(clean, .(interval), summarize, Avg = mean(steps))
# Line plot of the per-interval averages. labs() sets axis titles by aesthetic
# name (x, y); the previous xlab=/ylab= arguments are not aesthetics, so the
# intended axis labels were never applied.
ggplot(interval_steps, aes(x = interval, y = Avg)) +
  geom_line(col = "blue") +
  labs(
    title = "Time series plot of the 5-minute interval and the average number of steps taken, averaged across all days",
    x = "Interval",
    y = "Average number of steps"
  )
# Row for the interval with the highest average step count.
interval_steps[which.max(interval_steps[["Avg"]]), ]
## interval Avg
## 104 835 206.1698
# Number of missing values in each column of the raw data.
na_count <- colSums(is.na(activity))
print(na_count)
## steps date interval day DateTime
## 2304 0 0 0 0
We will use the average number of steps from the cleaned data (rows with missing values removed) to replace every NA in the original data.
# Copy the raw data, then substitute every missing step count with the
# rounded mean of the complete observations.
data_fill_NA <- activity
data_fill_NA$steps <- replace(
  data_fill_NA$steps,
  is.na(data_fill_NA$steps),
  round(mean(clean$steps))
)
head(data_fill_NA, n = 6L)
## # A tibble: 6 x 5
## steps date interval day DateTime
## <dbl> <date> <dbl> <chr> <dttm>
## 1 37 2012-10-01 0 Monday 2012-09-30 17:00:00
## 2 37 2012-10-01 5 Monday 2012-09-30 17:00:00
## 3 37 2012-10-01 10 Monday 2012-09-30 17:00:00
## 4 37 2012-10-01 15 Monday 2012-09-30 17:00:00
## 5 37 2012-10-01 20 Monday 2012-09-30 17:00:00
## 6 37 2012-10-01 25 Monday 2012-09-30 17:00:00
# Daily step totals recomputed on the imputed data, with readable column names.
sum_steps_fill <- setNames(
  aggregate(data_fill_NA$steps ~ data_fill_NA$date, FUN = sum),
  c("Date", "Steps")
)
# Mean and median of the imputed daily totals, rounded to whole steps.
MEAN_FILL <- round(mean(sum_steps_fill[["Steps"]]))
MEDIAN_FILL <- round(median(sum_steps_fill[["Steps"]]))
print(MEAN_FILL)
## [1] 10752
print(MEDIAN_FILL)
## [1] 10656
Before filling NA: Mean = 10766, Median = 10765. After filling NA: Mean = 10752, Median = 10656.
# Distribution of the daily step totals after imputation.
hist(
  sum_steps_fill[["Steps"]],
  breaks = 5,
  xlab = "Steps",
  main = "Total Steps per Day with NAs Fixed",
  col = "Purple"
)
**The overall shape of the distribution has not changed, even though the new mean and median decreased.**
# Classify each observation as Weekend or Weekday from the calendar date.
# POSIXlt's wday field is numeric (0 = Sunday, 6 = Saturday), so the result
# does not depend on the session locale, unlike matching the English day
# names produced by weekdays().
data_fill_NA$DayCategory <- ifelse(
  as.POSIXlt(as.Date(data_fill_NA$date))$wday %in% c(0, 6),
  "Weekend",
  "Weekday"
)
library(lattice)
# Mean steps per 5-minute interval, computed separately for each day category.
interval_steps2 <- ddply(
  data_fill_NA, .(interval, DayCategory), summarize, Avg = mean(steps)
)
# One line panel per day category, stacked in a single column.
xyplot(
  Avg ~ interval | DayCategory,
  data = interval_steps2,
  type = "l",
  layout = c(1, 2),
  main = "Average Steps per Interval Based on Type of Day",
  ylab = "Average Number of Steps",
  xlab = "Interval"
)