This is an R Markdown file for Project 1 of the Reproducible Research course. Make sure you use the HTML version to see the plots along with the code.
library(readr)
## Warning: package 'readr' was built under R version 4.1.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.1.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.1.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.1.3
# Fetch and unpack the course data set.
# A single relative `destfile` is used for both the download and the unzip
# step, so the archive that gets extracted is always the one that was just
# downloaded, regardless of which machine or working directory the script
# runs in.
data_url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2Factivity.zip"
destfile <- "repdata_data_activity.zip"
# mode = "wb": the zip archive is binary and must not be written in text mode.
download.file(data_url, destfile = destfile, mode = "wb")
# Extracts into the current working directory, where read_csv() looks below.
unzip(destfile)
# Read Data
activity <- read_csv("activity.csv")
## Rows: 17568 Columns: 3
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (2): steps, interval
## date (1): date
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Data Cleaning
# Coerce the data to a tibble, then total the steps recorded on each date.
# sum() is called without na.rm, so a date whose readings include NA gets an
# NA total.
activity <- as_tibble(activity)
spd <- summarise(group_by(activity, date), steps = sum(steps))
spd
## # A tibble: 61 x 2
## date steps
## <date> <dbl>
## 1 2012-10-01 NA
## 2 2012-10-02 126
## 3 2012-10-03 11352
## 4 2012-10-04 12116
## 5 2012-10-05 13294
## 6 2012-10-06 15420
## 7 2012-10-07 11015
## 8 2012-10-08 NA
## 9 2012-10-09 12811
## 10 2012-10-10 9900
## # ... with 51 more rows
# Histogram of the total number of steps per day.
ggplot(data = spd, mapping = aes(x = steps)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 8 rows containing non-finite values (stat_bin).
# Mean and median of the daily totals, leaving out days whose total is NA.
spd %>%
  filter(!is.na(steps)) %>%
  summarise(mean = mean(steps), median = median(steps))
## # A tibble: 1 x 2
## mean median
## <dbl> <dbl>
## 1 10766. 10765
# Line plot of the mean number of steps in each interval, computed over the
# rows that have no missing values.
ggplot(
  activity %>%
    drop_na() %>%
    group_by(interval) %>%
    summarise(steps = mean(steps)),
  aes(x = interval, y = steps)
) +
  geom_line()
# Mean steps per interval, sorted in descending order so the interval with
# the highest average appears in the first row.
arrange(
  summarise(
    group_by(drop_na(activity), interval),
    steps = mean(steps)
  ),
  desc(steps)
)
## # A tibble: 288 x 2
## interval steps
## <dbl> <dbl>
## 1 835 206.
## 2 840 196.
## 3 850 183.
## 4 845 180.
## 5 830 177.
## 6 820 171.
## 7 855 167.
## 8 815 158.
## 9 825 155.
## 10 900 143.
## # ... with 278 more rows
# Count the rows with and without a recorded step value.
# is.na() is applied to the `steps` column itself, so count() groups by a
# plain, clearly named logical vector. Calling is.na() on the whole data frame
# instead returns a logical matrix, which produces an unreadable column name.
activity %>% count(missing = is.na(steps))
## # A tibble: 2 x 2
## missing n
## <lgl> <int>
## 1 FALSE 15264
## 2 TRUE 2304
# Impute missing step counts with the overall mean of the observed values.
refill_for_na <- mean(activity$steps, na.rm = TRUE)
new_data <- activity
# Vectorised replacement: the logical index selects exactly the NA entries.
# Unlike a `1:length(x)` index loop, this is also safe for a zero-row table.
new_data$steps[is.na(new_data$steps)] <- refill_for_na
new_data
## # A tibble: 17,568 x 3
## steps date interval
## <dbl> <date> <dbl>
## 1 37.4 2012-10-01 0
## 2 37.4 2012-10-01 5
## 3 37.4 2012-10-01 10
## 4 37.4 2012-10-01 15
## 5 37.4 2012-10-01 20
## 6 37.4 2012-10-01 25
## 7 37.4 2012-10-01 30
## 8 37.4 2012-10-01 35
## 9 37.4 2012-10-01 40
## 10 37.4 2012-10-01 45
## # ... with 17,558 more rows
# Histogram of the total steps per day, computed from the imputed data.
ggplot(
  summarise(group_by(new_data, date), steps = sum(steps)),
  aes(steps)
) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Mean and median of the daily step totals, computed from the imputed data.
summarise(
  summarise(group_by(new_data, date), steps = sum(steps)),
  mean = mean(steps),
  median = median(steps)
)
## # A tibble: 1 x 2
## mean median
## <dbl> <dbl>
## 1 10766. 10766.
# Label each observation as falling on a weekend or a weekday.
# POSIXlt's `wday` is 0-6 counting from Sunday, so 0 (Sunday) and 6 (Saturday)
# are the weekend. Using the numeric day avoids the locale-dependent day names
# returned by weekdays().
new_data$day <- ifelse(
  as.POSIXlt(new_data$date)$wday %in% c(0L, 6L),
  "weekend",
  "weekday"
)