# Install and load the packages used for the tasks in this analysis:
# data cleaning, merging, visualization, and report generation.

# PREPARE ----
# Install whichever of the required packages are missing, then attach them.
# Packages that are already installed are left alone, so re-running the
# script does not trigger a fresh download every time.
required_packages <- c(
  "tidyverse", "lubridate", "dplyr", "ggplot2", "tidyr", "janitor", "here"
)
is_installed <- vapply(
  required_packages, requireNamespace, logical(1),
  quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(required_packages[!is_installed])
}

library(tidyverse)
library(lubridate)
library(dplyr)
library(ggplot2)
library(tidyr)
library(janitor)
library(here)

# Import the data sets required to perform the analysis.
# Each file name is resolved relative to the current working directory.
activity <- read_csv("dailyActivity_merged.csv")
calories <- read_csv("dailyCalories_merged.csv")
intensities <- read_csv("dailyIntensities_merged.csv")
weight <- read_csv("weightLogInfo_merged.csv")
sleep <- read_csv("sleepDay_merged.csv")

# PROCESS ----

# Cleaning the data

# 1. Check the first few rows of each data set with head().
head(activity)
head(calories)
head(intensities)
head(weight)
head(sleep)

# 2. Look at the column names of each data set with colnames().
colnames(activity)
colnames(calories)
colnames(intensities)
colnames(weight)
colnames(sleep)

# 3. Count the distinct Id values in each data set.
n_distinct(activity$Id)
n_distinct(calories$Id)
n_distinct(intensities$Id)
n_distinct(sleep$Id)
n_distinct(weight$Id)

# Drop exact duplicate rows from the sleep data and report how many remain.
sleep <- unique(sleep)
nrow(sleep)

# Split the timestamp columns into a date part and a time part.
# The timestamps contain more than one space (date, clock time, AM/PM), so
# `extra = "merge"` keeps everything after the first space in `Time` instead
# of discarding the trailing piece with a warning.
sleep_new <- sleep %>%
  separate(SleepDay, c("Date", "Time"), sep = " ", extra = "merge")

weight_new <- weight %>%
  separate(Date, c("Date", "Time"), sep = " ", extra = "merge")

# Convert the month/day/year strings into Date values.
activity$ActivityDate <- as.Date(activity$ActivityDate, format = "%m/%d/%Y")
sleep_new$Date <- as.Date(sleep_new$Date, format = "%m/%d/%Y")
weight_new$Date <- as.Date(weight_new$Date, format = "%m/%d/%Y")

# View() opens a data viewer, which is only available in an interactive
# session; skip it when the script is run non-interactively.
if (interactive()) {
  View(activity)
  View(sleep_new)
  View(weight_new)
}

# Summary statistics for the main activity measures.
activity %>%
  select(
    TotalSteps, TotalDistance, VeryActiveMinutes, FairlyActiveMinutes,
    LightlyActiveMinutes, SedentaryMinutes, Calories
  ) %>%
  summary()

# Summary statistics for the sleep_new data frame.
sleep_new %>%
  select(TotalSleepRecords, TotalMinutesAsleep, TotalTimeInBed) %>%
  summary()

# Summary statistics for the weight log.
weight %>%
  select(WeightKg, WeightPounds, BMI) %>%
  summary()

# 4. Format the dates so that every data set represents them uniformly.
activity_new <- activity %>%
  mutate(
    # Reformat as POSIXct to represent date and time. "ymd" is accepted in
    # addition to "mdy" because ActivityDate may already have been converted
    # to a Date earlier in the script, and a Date renders as year-month-day;
    # parsing that with a month/day/year order alone would fail.
    ActivityDate = parse_date_time(
      as.character(ActivityDate),
      orders = c("mdy", "ymd")
    ),
    # Date-only version of the timestamp.
    activity_date_ymd = as.Date(ActivityDate),
    # Day name, and whether that day falls on a weekday or a weekend.
    # Day names outside the two lists (e.g. in a non-English locale) give NA.
    day_of_week = weekdays(activity_date_ymd),
    time_of_week = case_when(
      day_of_week %in% c(
        "Monday", "Tuesday", "Wednesday", "Thursday", "Friday"
      ) ~ "Weekday",
      day_of_week %in% c("Saturday", "Sunday") ~ "Weekend"
    )
  )

# Add uniform date columns to the daily calories data.
calories_new <- calories %>%
  mutate(
    ActivityDay = parse_date_time(ActivityDay, "%m/%d/%Y"),
    # ActivityDay is POSIXct at this point. For POSIXct input the second
    # positional argument of as.Date() is a time zone, not a format, so no
    # format string is passed.
    activity_date_ymd = as.Date(ActivityDay),
    day_of_week = weekdays(activity_date_ymd),
    # Day names outside the two lists give NA.
    time_of_week = case_when(
      day_of_week %in% c(
        "Monday", "Tuesday", "Wednesday", "Thursday", "Friday"
      ) ~ "Weekday",
      day_of_week %in% c("Saturday", "Sunday") ~ "Weekend"
    )
  )

# Add the same date columns to the daily intensities data.
intensities_new <- intensities %>%
  mutate(
    ActivityDay = parse_date_time(ActivityDay, "%m/%d/%Y"),
    activity_date_ymd = as.Date(ActivityDay),
    day_of_week = weekdays(activity_date_ymd),
    time_of_week = case_when(
      day_of_week %in% c(
        "Monday", "Tuesday", "Wednesday", "Thursday", "Friday"
      ) ~ "Weekday",
      day_of_week %in% c("Saturday", "Sunday") ~ "Weekend"
    )
  )

# Build sleep_new from `sleep` with uniform date columns. This replaces any
# sleep_new created earlier in the script.
sleep_new <- sleep %>%
  mutate(
    SleepDay = parse_date_time(SleepDay, "%m/%d/%Y %I:%M:%S %p"),
    # SleepDay is POSIXct at this point. For POSIXct input the second
    # positional argument of as.Date() is a time zone, not a format, so no
    # format string is passed.
    activity_date_ymd = as.Date(SleepDay),
    day_of_week = weekdays(activity_date_ymd),
    # Day names outside the two lists give NA.
    time_of_week = case_when(
      day_of_week %in% c(
        "Monday", "Tuesday", "Wednesday", "Thursday", "Friday"
      ) ~ "Weekday",
      day_of_week %in% c("Saturday", "Sunday") ~ "Weekend"
    )
  )

# Build weight_new from `weight` with uniform date and time columns. This
# replaces any weight_new created earlier in the script.
weight_new <- weight %>%
  mutate(
    Date = parse_date_time(Date, "%m/%d/%Y %I:%M:%S %p"),
    # Date is POSIXct at this point. For POSIXct input the second positional
    # argument of as.Date() is a time zone, not a format, so no format string
    # is passed.
    activity_date_ymd = as.Date(Date),
    # Clock time of the log entry as text, e.g. "11:59:59 PM".
    activity_time = format(Date, format = "%I:%M:%S %p"),
    day_of_week = weekdays(activity_date_ymd),
    # Day names outside the two lists give NA.
    time_of_week = case_when(
      day_of_week %in% c(
        "Monday", "Tuesday", "Wednesday", "Thursday", "Friday"
      ) ~ "Weekday",
      day_of_week %in% c("Saturday", "Sunday") ~ "Weekend"
    ),
    # Hour of the log entry (0-23). Previously this was
    # as.POSIXct(Date, format = ...), which ignores `format` for input that
    # is already POSIXct and therefore just duplicated the Date column.
    hour_of_day = as.integer(format(Date, "%H"))
  )

# Plot the activity data with the ggplot2 library.
colnames(activity_new)

# Calories against total steps.
ggplot(data = activity_new, aes(x = TotalSteps, y = Calories)) +
  geom_point(color = "blue")

# Sedentary minutes against total steps.
ggplot(data = activity_new, aes(x = TotalSteps, y = SedentaryMinutes)) +
  geom_point(color = "red")

# Distribution of calories.
ggplot(data = activity_new, aes(x = Calories)) +
  geom_histogram(fill = "skyblue", color = "black")

# Calories over time.
ggplot(data = activity_new, aes(x = ActivityDate, y = Calories)) +
  geom_line(color = "red")

# Calories per day of the week; points are jittered horizontally so that
# observations on the same day do not sit on top of each other.
ggplot(data = activity_new, aes(x = day_of_week, y = Calories)) +
  geom_point(
    position = position_jitter(width = 0.2),
    alpha = 0.3,
    color = "red"
  ) +
  ggtitle("Calories Burned by Day of the Week")

# Columns shared by all the *_new data frames and used as merge keys.
join_keys <- c("Id", "activity_date_ymd", "day_of_week", "time_of_week")

# Left-join sleep onto activity (all activity rows are kept).
combined_data_temp <- merge(
  x = activity_new, y = sleep_new,
  by = join_keys, all.x = TRUE
)
# Inspect column names only after the merged frame exists; calling
# colnames() on it before the merge fails because the object is undefined.
colnames(combined_data_temp)
colnames(weight_new)
n_distinct(combined_data_temp$Id)

# Left-join the weight log onto the activity + sleep data.
combined_data_1 <- merge(
  x = combined_data_temp, y = weight_new,
  by = join_keys, all.x = TRUE
)

# Left-join calories onto intensities.
combined_data_2 <- merge(
  intensities_new, calories_new,
  by = join_keys, all.x = TRUE
)

colnames(combined_data_1)
colnames(combined_data_2)

# Summary statistics for the key measures of the combined data.
combined_data_1 %>%
  select(
    TotalSteps, TotalDistance, Calories, WeightKg, BMI,
    SleepDay, TotalMinutesAsleep, TotalTimeInBed
  ) %>%
  summary()

# Mean total steps per day of the week. The x aesthetic is day_of_week so
# that the bars match the plot title; it was previously mapped to the
# ActivityDate timestamp, which produced one bar per calendar date.
ggplot(data = combined_data_1, aes(x = day_of_week, y = TotalSteps)) +
  stat_summary(fun = mean, geom = "bar", fill = "pink", color = "black") +
  theme_minimal() +
  labs(title = "Average Total Steps by Day of Week")

# Individual total-step observations per day of the week, jittered
# horizontally. As above, x is day_of_week to match the title (previously
# the calendar date).
ggplot(data = combined_data_1, aes(x = day_of_week, y = TotalSteps)) +
  geom_jitter(color = "blue", width = 0.2, alpha = 0.5) +
  theme_minimal() +
  labs(title = "Total Steps by Day of Week (Dot Plot with Jitter)")

head(combined_data_2)

# Mean weight per day of the week. `group = 1` makes the line connect the
# means across the discrete x values.
ggplot(
  data = combined_data_1,
  aes(x = day_of_week, y = WeightKg, group = 1)
) +
  stat_summary(fun = mean, geom = "line", color = "red", size = 1) +
  stat_summary(fun = mean, geom = "point", color = "red", size = 2) +
  theme_minimal() +
  labs(title = "Average Weight by Day of Week (Line Plot)")

# Calories per day of the week, jittered horizontally.
ggplot(data = activity_new, aes(x = day_of_week, y = Calories)) +
  geom_point(
    position = position_jitter(width = 0.2),
    alpha = 0.3,
    color = "red"
  ) +
  ggtitle("Calories Burned by Day of the Week")

# Mean calories per day of the week.
ggplot(data = combined_data_1, aes(x = day_of_week, y = Calories)) +
  stat_summary(fun = mean, geom = "bar", fill = "coral", color = "darkred") +
  theme_minimal() +
  labs(title = "Average Calories Burned by Day of Week")

# Mean total distance per day of the week.
ggplot(
  data = combined_data_1,
  aes(x = day_of_week, y = TotalDistance, group = 1)
) +
  stat_summary(fun = mean, geom = "line", color = "blue", size = 1) +
  stat_summary(fun = mean, geom = "point", color = "blue", size = 2) +
  theme_minimal() +
  labs(title = "Average Total Distance by Day of Week (Line Plot)")

head(combined_data_2)

# Sedentary minutes over time with a loess trend line.
ggplot(
  data = combined_data_2,
  aes(x = activity_date_ymd, y = SedentaryMinutes)
) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE) +
  ggtitle("Trends in Sedentary Minutes Over Time")

colnames(combined_data_1)

# Total steps against very active minutes.
ggplot(data = combined_data_1, aes(x = VeryActiveMinutes, y = TotalSteps)) +
  # Colour the points by calories so that the continuous colour scale below
  # has a variable to act on (without a colour mapping the scale did
  # nothing); also set point size and transparency.
  geom_point(aes(color = Calories), size = 2, alpha = 0.6) +
  # Linear fit for simplicity.
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  ggtitle("Relationship Between Very Active Minutes and Total Steps") +
  xlab("Very Active Minutes") +
  ylab("Total Steps") +
  # Colour scale for a continuous variable.
  scale_color_viridis_c() +
  theme_minimal()

# Sedentary minutes against very active minutes. The title no longer says
# "by Day of Week" because the plot does not split by day.
ggplot(
  data = combined_data_2,
  aes(x = VeryActiveMinutes, y = SedentaryMinutes)
) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE) +
  ggtitle("Relationship Between Very Active Minutes and Sedentary Minutes")

# Total steps against fairly active minutes. The title now names the
# variable actually on the x axis (it previously said "Very Active").
ggplot(data = combined_data_1, aes(x = FairlyActiveMinutes, y = TotalSteps)) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE) +
  ggtitle("Relationship Between Fairly Active Minutes and Total Steps")

# 2-D binned counts of very active minutes against calories.
ggplot(data = combined_data_1, aes(x = VeryActiveMinutes, y = Calories)) +
  stat_bin2d() +
  ggtitle("Heatmap of Very Active Minutes vs Calories")

# Total distance against very active minutes, jittered, with a loess curve.
ggplot(
  data = combined_data_1,
  aes(x = VeryActiveMinutes, y = TotalDistance)
) +
  geom_jitter(width = 0.2, height = 0.2, alpha = 0.5) +
  geom_smooth(method = "loess") +
  ggtitle("Total Distance by Very Active Minutes with Jitter")

# Very active minutes per day of the week with the daily mean overlaid as a
# line. x is day_of_week so that the plot matches its title (it was
# previously mapped to the calendar date).
ggplot(
  data = combined_data_1,
  aes(x = day_of_week, y = VeryActiveMinutes)
) +
  geom_jitter(width = 0.2, alpha = 0.6) +
  stat_summary(
    fun = mean, geom = "line", aes(group = 1),
    color = "red", size = 1
  ) +
  ggtitle("Variation in Very Active Minutes Across Days of the Week")

# Bubble plot: point size encodes sedentary minutes, as the title and the
# legend label state (size was previously mapped to VeryActiveMinutes, which
# is already on the x axis).
ggplot(
  data = combined_data_1,
  aes(x = VeryActiveMinutes, y = TotalDistance, size = SedentaryMinutes)
) +
  geom_point(alpha = 0.6, color = "blue") +
  ggtitle(
    "Bubble Plot of Very Active Minutes, Total Distance, and Sedentary Minutes"
  ) +
  labs(size = "Sedentary Minutes")

# Total distance against very active minutes, jittered, with a loess curve.
ggplot(
  data = combined_data_1,
  aes(x = VeryActiveMinutes, y = TotalDistance)
) +
  geom_jitter(width = 0.2, height = 0.2, alpha = 0.5) +
  geom_smooth(method = "loess") +
  ggtitle("Total Distance by Very Active Minutes with Jitter")

colnames(combined_data_1)

# Generate the report with knitr. The package is installed only when it is
# missing, and knit() is called through its namespace because knitr is never
# attached with library() in this script.
if (!requireNamespace("knitr", quietly = TRUE)) {
  install.packages("knitr")
}
# NOTE(review): "your_file.Rmd" looks like a placeholder; replace it with the
# path of the actual R Markdown file to render.
knitr::knit("your_file.Rmd")