# Packages used by the analysis. Attaching dplyr masks filter/lag from stats
# and intersect/setdiff/setequal/union from base.
library(dplyr)
library(ggplot2)

# Raw activity data, read from the current working directory.
activity <- read.csv(file = "activity.csv", header = TRUE)
# Distinct dates present in the data.
unique(activity$date)

# Total steps per day. A day on which every reading is missing gets NA, not 0:
# sum(steps, na.rm = TRUE) alone would report such a day as zero steps and
# drag the histogram, mean and median towards zero.
totalsteps <- activity %>%
  group_by(date) %>%
  summarise(
    total = if (all(is.na(steps))) NA_integer_ else sum(steps, na.rm = TRUE)
  )
totalsteps

# Histogram of the daily totals. The fill is mapped to the bin position and
# coloured through the blues9 palette; passing blues9 as a constant `fill`
# only works when the histogram happens to have exactly nine bins.
# na.rm = TRUE drops the days whose total is NA without a warning.
ggplot(totalsteps, aes(total)) +
  geom_histogram(aes(fill = after_stat(x)), binwidth = 2500, na.rm = TRUE) +
  scale_fill_gradientn(colours = blues9, guide = "none") +
  xlab("Total number of steps per day") +
  ylab("count of total number of steps per day") +
  ggtitle("Histogram of steps per day")

# Mean and median of the daily totals, skipping days with no data.
mean(totalsteps$total, na.rm = TRUE)
median(totalsteps$total, na.rm = TRUE)
# Mean number of steps for each interval, averaged across all days
# (missing readings are ignored).
avesteps <- activity %>%
  group_by(interval) %>%
  summarise(average = mean(steps, na.rm = TRUE))

# `interval` is kept in the same units as activity$interval. Rescaling it
# would stop avesteps from being matched back to the raw data by interval.
ggplot(avesteps, aes(x = interval, y = average)) +
  geom_line(color = "steelblue", size = 1, alpha = 0.8)

# Row holding the highest average: interval 835, about 206 steps
# (presumably the 5-minute slot starting at 08:35 -- confirm the encoding).
avesteps[which.max(avesteps$average), ]
# Missing values per column: `steps` has 2304, `date` and `interval` have none.
activity %>%
  is.na() %>%
  colSums()
# NA values are replaced by the average number of steps in the same 5-minute interval.
# A new column, Complete_steps, is created with all NA values replaced.
# Mean of the observed steps for each interval, returned row-by-row in the
# same order as `activity`. It is computed directly from `activity`, so the
# fill-in value does not depend on how any summary table keys its intervals;
# a failed lookup there would silently leave the NAs in place.
interval_mean <- ave(
  as.numeric(activity$steps),
  activity$interval,
  FUN = function(x) mean(x, na.rm = TRUE)
)

# Complete_steps: the recorded value where present, otherwise the rounded
# interval mean.
activity$Complete_steps <- ifelse(
  is.na(activity$steps),
  round(interval_mean, 0),
  activity$steps
)

# Imputed data set with the same three columns as the raw data.
Finalactivity <- data.frame(
  steps = activity$Complete_steps,
  interval = activity$interval,
  date = activity$date
)

# First 5 rows of the imputed data; `steps` should no longer contain NA here.
head(Finalactivity, n = 5)
# Daily step totals recomputed from the imputed data set.
Finalactivity_total <- Finalactivity %>%
  group_by(date) %>%
  summarise(total = sum(steps, na.rm = TRUE))

# Histogram of the imputed daily totals. The fill is mapped to the bin
# position and coloured through blues9; a constant `fill = blues9` is only
# accepted when the histogram has exactly nine bins.
ggplot(Finalactivity_total, aes(total)) +
  geom_histogram(aes(fill = after_stat(x)), binwidth = 2500) +
  scale_fill_gradientn(colours = blues9, guide = "none")

mean(Finalactivity_total$total)
median(Finalactivity_total$total)
# NOTE(review): re-run and compare with the mean/median of totalsteps$total
# before concluding how much the imputation shifts them.
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Convert the date strings to Date and record the weekday name of each row.
activity$date <- as.Date(activity$date)
activity$day <- weekdays(activity$date)
unique(activity$day)

# Weekend/weekday flag. The numeric day of week (0 = Sunday, 6 = Saturday) is
# used instead of the weekdays() names, which follow the session locale and
# would never equal "Saturday"/"Sunday" on a non-English system.
day_number <- as.POSIXlt(activity$date)$wday
activity$daytype <- ifelse(
  day_number == 0 | day_number == 6,
  "Weekend",
  "Weekday"
)
unique(activity$daytype)

# Average steps per interval, separately for weekdays and weekends.
# .groups = "drop" returns an ungrouped table and silences the regrouping
# message.
# NOTE(review): this averages the raw `steps` (NAs dropped), not the imputed
# Complete_steps column -- confirm that is intended.
activity_grouped <- activity %>%
  group_by(daytype, interval) %>%
  summarise(ave = mean(steps, na.rm = TRUE), .groups = "drop")

# One panel per day type, stacked vertically.
ggplot(activity_grouped, aes(x = interval, y = ave)) +
  geom_line(color = "steelblue") +
  facet_grid(daytype ~ .) +
  ylab("average number of steps")
## End