Loading and preprocessing the data
- Load the data
library(dplyr)
# Extract the raw CSV from the archive into the working directory.
unzip("activity.zip")
# Parse the CSV once. `file` stays as the reference copy of the raw data and
# `file2` is the working copy that later steps modify; R copies on
# modification, so changing `file2` never alters `file`.
file <- read.csv("activity.csv")
file2 <- file
- Process/transform the data (if necessary) into a format suitable for your analysis
# Convert the `date` column of both frames to Date. Each frame parses its own
# column, so the conversion does not rely on `file` and `file2` holding the
# same rows in the same order.
file$date <- as.Date(file$date, format = "%Y-%m-%d")
file2$date <- as.Date(file2$date, format = "%Y-%m-%d")
# Preview the first rows of the converted data.
head(file)
## steps date interval
## 1 NA 2012-10-01 0
## 2 NA 2012-10-01 5
## 3 NA 2012-10-01 10
## 4 NA 2012-10-01 15
## 5 NA 2012-10-01 20
## 6 NA 2012-10-01 25
What is the mean total number of steps taken per day?
- Calculate the total number of steps taken per day
# Total steps for every calendar date. Missing readings are dropped before
# summing (na.rm = TRUE), so a date whose readings are all NA totals 0.
steps_date <- summarise(
  group_by(file, date),
  steps = sum(steps, na.rm = TRUE)
)
print(steps_date)
## # A tibble: 61 x 2
## date steps
## <date> <int>
## 1 2012-10-01 0
## 2 2012-10-02 126
## 3 2012-10-03 11352
## 4 2012-10-04 12116
## 5 2012-10-05 13294
## 6 2012-10-06 15420
## 7 2012-10-07 11015
## 8 2012-10-08 0
## 9 2012-10-09 12811
## 10 2012-10-10 9900
## # ... with 51 more rows
- Make a histogram of the total number of steps taken each day
# Histogram of the per-day step totals computed in `steps_date`.
hist(
  steps_date$steps,
  main = "Frequency of Total Steps per Day",
  xlab = "Steps",
  col = "rosybrown1"
)

- Calculate and report the mean and median of the total number of steps taken per day
# One-row data frame with the mean and median of the daily step totals.
report <- with(
  steps_date,
  data.frame(
    mean = mean(steps, na.rm = TRUE),
    median = median(steps, na.rm = TRUE)
  )
)
print(report)
## mean median
## 1 9354.23 10395
What is the average daily activity pattern?
- Make a time series plot of the 5-minute interval (x-axis) and the average number of steps taken, averaged across all days (y-axis)
# Mean step count for each 5-minute interval, ignoring missing readings.
steps_interval <- summarise(
  group_by(file, interval),
  steps = mean(steps, na.rm = TRUE)
)
print(steps_interval)
## # A tibble: 288 x 2
## interval steps
## <int> <dbl>
## 1 0 1.72
## 2 5 0.340
## 3 10 0.132
## 4 15 0.151
## 5 20 0.0755
## 6 25 2.09
## 7 30 0.528
## 8 35 0.868
## 9 40 0
## 10 45 1.47
## # ... with 278 more rows
# Line plot of the mean step count (y) against the 5-minute interval (x).
plot(
  steps_interval$interval,
  steps_interval$steps,
  type = "l",
  main = "Average Steps per Interval",
  xlab = "Interval",
  ylab = "Steps",
  col = "rosybrown",
  lwd = 2
)

- Which 5-minute interval, on average across all the days in the dataset, contains the maximum number of steps?
# Keep the row(s) of `steps_interval` whose mean step count equals the
# largest mean found in the table.
max_interval <- with(steps_interval, steps_interval[steps == max(steps), ])
print(max_interval)
## # A tibble: 1 x 2
## interval steps
## <int> <dbl>
## 1 835 206.
Imputing missing values
- Calculate and report the total number of missing values in the dataset (i.e. the total number of rows with NAs)
# Number of rows that contain at least one NA. complete.cases() flags rows
# with no missing value in any column, so a row with several NAs is still
# counted once (counting NA cells would count it several times).
data_na <- sum(!complete.cases(file))
print(data_na)
## [1] 2304
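- Devise a strategy for filling in all of the missing values in the dataset, then create a new dataset that is equal to the original dataset but with the missing data filled in. Strategy used here: each missing step count is replaced with the mean for that 5-minute interval, averaged across all days.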
# Fill every missing step count in `file2` with the mean of its 5-minute
# interval taken from `steps_interval`. match() looks up each row's interval
# in one vectorised pass, and only the `steps` column is touched, so it stays
# a plain numeric vector instead of being turned into a list column.
file2$steps <- ifelse(
  is.na(file2$steps),
  steps_interval$steps[match(file2$interval, steps_interval$interval)],
  file2$steps
)
head(file2)
## steps date interval
## 1 1.716981 2012-10-01 0
## 2 0.3396226 2012-10-01 5
## 3 0.1320755 2012-10-01 10
## 4 0.1509434 2012-10-01 15
## 5 0.0754717 2012-10-01 20
## 6 2.09434 2012-10-01 25
- Make a histogram of the total number of steps taken each day, and calculate and report the mean and median total number of steps taken per day.
# Total steps per date for the imputed data. `steps` is coerced with
# as.numeric() before summing.
steps_date2 <- summarise(
  group_by(file2, date),
  steps = sum(as.numeric(steps))
)
print(steps_date2)
## # A tibble: 61 x 2
## date steps
## <date> <dbl>
## 1 2012-10-01 10766.
## 2 2012-10-02 126
## 3 2012-10-03 11352
## 4 2012-10-04 12116
## 5 2012-10-05 13294
## 6 2012-10-06 15420
## 7 2012-10-07 11015
## 8 2012-10-08 10766.
## 9 2012-10-09 12811
## 10 2012-10-10 9900
## # ... with 51 more rows
# Histogram of the per-day step totals after the missing values were filled.
hist(
  steps_date2$steps,
  main = "Frequency of Total Steps per Day (fixed NAs)",
  xlab = "Steps",
  col = "rosybrown1"
)

# Side-by-side mean and median of the daily totals: first row from the
# imputed data (`steps_date2`), second row from the original data
# (`steps_date`). data.frame() prints the "NA" column under the name `NA.`.
report2 <- data.frame(
  "NA" = c("corrected", "not corrected"),
  "mean" = c(
    mean(steps_date2$steps),
    mean(steps_date$steps, na.rm = TRUE)
  ),
  "median" = c(
    median(steps_date2$steps),
    median(steps_date$steps, na.rm = TRUE)
  )
)
print(report2)
## NA. mean median
## 1 corrected 10766.19 10766.19
## 2 not corrected 9354.23 10395.00
Are there differences in activity patterns between weekdays and weekends?
- Create a new factor variable in the dataset with two levels – “weekday” and “weekend” indicating whether a given date is a weekday or weekend day.
# Label every row as "weekday" or "weekend". as.POSIXlt()$wday numbers the
# days 0 (Sunday) to 6 (Saturday) independently of the session locale, so the
# labels no longer depend on the language of the weekday abbreviations.
file2$day <- ifelse(
  as.POSIXlt(file2$date)$wday %in% c(0L, 6L),
  "weekend",
  "weekday"
)
head(file2)
## steps date interval day
## 1 1.716981 2012-10-01 0 weekday
## 2 0.3396226 2012-10-01 5 weekday
## 3 0.1320755 2012-10-01 10 weekday
## 4 0.1509434 2012-10-01 15 weekday
## 5 0.0754717 2012-10-01 20 weekday
## 6 2.09434 2012-10-01 25 weekday
- Make a panel plot containing a time series plot of the 5-minute interval (x-axis) and the average number of steps taken, averaged across all weekday days or weekend days (y-axis).
# Mean step count for each combination of day type and 5-minute interval.
# `steps` is coerced with as.numeric() before averaging.
steps_day <- summarise(
  group_by(file2, day, interval),
  steps = mean(as.numeric(steps))
)
head(steps_day)
## # A tibble: 6 x 3
## # Groups: day [1]
## day interval steps
## <chr> <int> <dbl>
## 1 weekday 0 2.25
## 2 weekday 5 0.445
## 3 weekday 10 0.173
## 4 weekday 15 0.198
## 5 weekday 20 0.0990
## 6 weekday 25 1.59
# Weekday series. The y-axis spans both day types so the weekend line added
# below cannot fall outside the plotting region.
with(
  subset(steps_day, day == "weekday"),
  plot(
    interval, steps,
    type = "l",
    col = "brown1",
    ylim = range(steps_day$steps, na.rm = TRUE),
    main = "Average of steps per day",
    xlab = "Interval",
    ylab = "Steps"
  )
)
# Weekend series drawn on the same axes.
with(
  subset(steps_day, day == "weekend"),
  lines(interval, steps, col = "lightblue4")
)
# Legend colours and labels match the two lines drawn above.
legend(
  "topright",
  legend = c("weekday", "weekend"),
  col = c("brown1", "lightblue4"),
  lwd = 2
)
