Read a file in table format
# Load the activity log. Column classes are fixed up front so the second
# column is parsed as a Date and the other two as integers.
activity <- read.csv(
  "activity.csv",
  header = TRUE,
  colClasses = c("integer", "Date", "integer")
)
Display the structure of the dataset
# Compact overview of the column types and the first few values.
str(object = activity)
## 'data.frame': 17568 obs. of 3 variables:
## $ steps : int NA NA NA NA NA NA NA NA NA NA ...
## $ date : Date, format: "2012-10-01" "2012-10-01" ...
## $ interval: int 0 5 10 15 20 25 30 35 40 45 ...
Return the first rows of the dataset
# Show the first six rows of the raw data.
head(x = activity, n = 6L)
## steps date interval
## 1 NA 2012-10-01 0
## 2 NA 2012-10-01 5
## 3 NA 2012-10-01 10
## 4 NA 2012-10-01 15
## 5 NA 2012-10-01 20
## 6 NA 2012-10-01 25
Histogram of the total number of steps taken each day
# Histogram of the total number of steps per day.
# Rows with a missing step count are dropped before aggregating, so days
# without any recorded data are left out instead of appearing as zero-step
# days. `!is.na(steps)` is used because comparing `steps` with the logical
# result of `is.na()` would also discard every genuine zero-step row.
activity %>%
  filter(!is.na(steps)) %>%
  group_by(date) %>%
  summarise(sumSteps = sum(steps)) %>%
  ggplot(aes(sumSteps)) +
  geom_histogram(bins = 20, color = "black", fill = "light blue") +
  xlab(label = "") +
  theme_light()
Mean and median total number of steps taken per day
# Mean and median of the daily step totals.
# Missing step counts are removed *before* summing per day. Summing with
# `na.rm = TRUE` alone turns a day with no recorded data into a total of 0,
# which would be counted as a real zero-step day and bias both statistics
# downwards.
activity %>%
  filter(!is.na(steps)) %>%
  group_by(date) %>%
  summarise(sumSteps = sum(steps)) %>%
  summarise(
    stepsByDayMean = mean(sumSteps),
    stepsByDayMedian = median(sumSteps)
  )
## # A tibble: 1 x 2
## stepsByDayMean stepsByDayMedian
## <dbl> <int>
## 1 10766.19 10765
Time series plot of the average number of steps per 5-minute interval (x-axis)
# Average number of steps for each 5-minute interval, averaged over all days.
# Only rows with a missing step count are excluded; zero-step observations
# are kept, since dropping them would inflate every interval mean.
activity %>%
  filter(!is.na(steps)) %>%
  group_by(interval) %>%
  summarise(MeanSteps = mean(steps)) %>%
  arrange(interval) %>%
  ggplot(aes(y = MeanSteps, x = interval)) +
  geom_line() +
  xlab("5-minute interval") +
  ylab("Average number of steps") +
  theme_light()
Maximum average number of steps across the 5-minute intervals
# Largest per-interval average: first average the steps within each
# interval (ignoring missing values), then take the maximum of those means.
summarise(
  summarise(
    group_by(activity, interval),
    MeanSteps = mean(steps, na.rm = TRUE)
  ),
  MaxAverageNumberOfSteps = max(MeanSteps)
)
## # A tibble: 1 x 1
## MaxAverageNumberOfSteps
## <dbl>
## 1 206.1698
Calculate and report the total number of missing values in the dataset
# Number of missing values in each column, returned as a named integer vector.
vapply(
  X = activity,
  FUN = function(column) sum(is.na(column)),
  FUN.VALUE = integer(1)
)
## steps date interval
## 2304 0 0
Impute missing data: replace “NA” values with the overall mean number of steps.
# Work on a copy so the original data stays available for comparison.
activity.imp <- activity
# Replace each missing step count with the overall mean of the observed
# values. The `steps` vector is indexed directly: using `is.na()` of the
# whole data frame as a row index only gives the right rows when `steps` is
# the first column and no other column contains missing values.
activity.imp$steps[is.na(activity.imp$steps)] <-
  mean(activity.imp$steps, na.rm = TRUE)
Histogram of the total number of steps taken each day before and after missing values are imputed
# Tag each data set so the two can be told apart once they are stacked.
activity$impute <- "before"
activity.imp$impute <- "after"
# Histogram of daily step totals before and after imputation.
# Only rows that still have a missing step count are dropped (these exist
# only in the "before" data); zero-step rows are kept. Comparing `steps`
# with the logical result of `is.na()` would discard the zero rows as well.
union_all(x = activity, y = activity.imp) %>%
  filter(!is.na(steps)) %>%
  group_by(date, impute) %>%
  summarise(sumSteps = sum(steps)) %>%
  ggplot(aes(sumSteps, fill = impute)) +
  geom_histogram(bins = 20) +
  xlab(label = "") +
  theme_light()
Mean and median total number of steps taken each day before and after imputation
# Mean and median of the daily step totals before and after imputation.
# Rows with a missing step count are removed before summing, so that in the
# "before" data a day without any recorded values is excluded rather than
# counted as a day with a total of 0 steps.
union_all(x = activity, y = activity.imp) %>%
  filter(!is.na(steps)) %>%
  group_by(date, impute) %>%
  summarise(sumSteps = sum(steps)) %>%
  group_by(impute) %>%
  summarise(
    stepByDayMean = mean(sumSteps),
    stepByDayMedian = median(sumSteps)
  )
## # A tibble: 2 x 3
## impute stepByDayMean stepByDayMedian
## <chr> <dbl> <dbl>
## 1 after 10766.19 10766.19
## 2 before 10766.19 10765.00
Create new factor variables in the dataset
The variable `weekend` has two levels, “weekday” and “weekend”, indicating whether a given date is a weekday or a weekend day.
# Abbreviated day name for each date, stored as a factor. The labels depend
# on the session locale, so they are not used for the classification below.
activity$weekday <- as.factor(weekdays(activity$date, abbreviate = TRUE))
# Classify each date as weekday or weekend using the ISO weekday number
# ("%u": 1 = Monday, ..., 6 = Saturday, 7 = Sunday). Unlike matching on
# localized day-name abbreviations, this gives the same result in every
# locale.
activity$weekend <- factor(
  ifelse(
    format(activity$date, "%u") %in% c("6", "7"),
    "weekend",
    "weekday"
  ),
  levels = c("weekday", "weekend")
)
Time series plot of the average number of steps per 5-minute interval (x-axis), shown separately for weekdays and weekends
# Average number of steps per 5-minute interval, split by weekday/weekend and
# drawn as one panel per group.
# Only rows with a missing step count are excluded; zero-step observations
# are kept, since dropping them would inflate every interval mean.
activity %>%
  filter(!is.na(steps)) %>%
  group_by(interval, weekend) %>%
  summarise(MeanSteps = mean(steps)) %>%
  ggplot(aes(y = MeanSteps, x = interval, col = weekend)) +
  geom_line() +
  xlab("5-minute interval") +
  ylab("Average number of steps") +
  facet_grid(weekend ~ .) +
  theme_light()