library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
clean the weight and bmi data for fitbit users
weight<-read.csv("C:\\Users\\deyvi\\OneDrive\\Desktop\\R files\\mturkfitbit_export_3.12.16-4.11.16\\Fitabase Data 3.12.16-4.11.16\\weightLogInfo_merged.csv")
library(tidyverse)
weight %>%
group_by_all() %>%
filter(n()>1) %>%
ungroup()
## # A tibble: 0 × 8
## # ℹ 8 variables: Id <dbl>, Date <chr>, WeightKg <dbl>, WeightPounds <dbl>,
## # Fat <int>, BMI <dbl>, IsManualReport <chr>, LogId <dbl>
#no duplicate rows
n_distinct(weight$Id) #11
## [1] 11
length(weight) #8 variables or columns
## [1] 8
names(weight)
## [1] "Id" "Date" "WeightKg" "WeightPounds"
## [5] "Fat" "BMI" "IsManualReport" "LogId"
length(weight$Id) #33 rows
## [1] 33
weight %>%
group_by(Id) %>%
filter(n()>1) %>%
ungroup()
## # A tibble: 25 × 8
## Id Date WeightKg WeightPounds Fat BMI IsManualReport LogId
## <dbl> <chr> <dbl> <dbl> <int> <dbl> <chr> <dbl>
## 1 2873212765 4/6/2016… 56.7 125. NA 21.5 True 1.46e12
## 2 2873212765 4/7/2016… 57.2 126. NA 21.6 True 1.46e12
## 3 6962181067 3/30/201… 61.5 136. NA 24.0 True 1.46e12
## 4 6962181067 3/31/201… 61.5 136. NA 24.0 True 1.46e12
## 5 6962181067 4/1/2016… 60.9 134. NA 23.8 True 1.46e12
## 6 6962181067 4/2/2016… 61.2 135. NA 23.9 True 1.46e12
## 7 6962181067 4/3/2016… 61.5 136. NA 24.0 True 1.46e12
## 8 6962181067 4/4/2016… 62.4 138. NA 24.4 True 1.46e12
## 9 6962181067 4/5/2016… 61.7 136. NA 24.1 True 1.46e12
## 10 6962181067 4/6/2016… 62.2 137. NA 24.3 True 1.46e12
## # ℹ 15 more rows
#25 of 33 rows contain duplicates
ggplot(weight, aes(WeightKg,BMI))+
geom_point()

class(weight$Id)
## [1] "numeric"
weight$Id<-as.factor(weight$Id)
ggplot(weight,aes(WeightKg,BMI))+
geom_point(aes(color = Id,shape = Id))
## Warning: The shape palette can deal with a maximum of 6 discrete values because more
## than 6 becomes difficult to discriminate
## ℹ you have requested 11 values. Consider specifying shapes manually if you need
## that many have them.
## Warning: Removed 26 rows containing missing values or values outside the scale range
## (`geom_point()`).

ggplot(weight,aes(WeightKg,BMI))+
geom_point(aes(color = Id))

class(weight$Date)
## [1] "character"
weight$Date
## [1] "4/5/2016 11:59:59 PM" "4/10/2016 6:33:26 PM" "4/3/2016 11:59:59 PM"
## [4] "4/6/2016 11:59:59 PM" "4/7/2016 11:59:59 PM" "4/5/2016 11:59:59 PM"
## [7] "3/30/2016 11:59:59 PM" "4/8/2016 11:59:59 PM" "4/4/2016 11:59:59 PM"
## [10] "3/30/2016 11:59:59 PM" "3/31/2016 11:59:59 PM" "4/1/2016 11:59:59 PM"
## [13] "4/2/2016 11:59:59 PM" "4/3/2016 11:59:59 PM" "4/4/2016 11:59:59 PM"
## [16] "4/5/2016 11:59:59 PM" "4/6/2016 11:59:59 PM" "4/7/2016 11:59:59 PM"
## [19] "4/8/2016 11:59:59 PM" "4/9/2016 11:59:59 PM" "4/10/2016 11:59:59 PM"
## [22] "4/11/2016 11:59:59 PM" "4/12/2016 11:59:59 PM" "4/7/2016 11:59:59 PM"
## [25] "4/1/2016 6:49:40 AM" "4/4/2016 6:53:43 AM" "4/5/2016 6:40:15 AM"
## [28] "4/6/2016 6:49:31 AM" "4/7/2016 6:15:08 AM" "4/8/2016 6:39:44 AM"
## [31] "4/9/2016 8:06:12 AM" "4/11/2016 6:58:09 AM" "4/12/2016 6:47:11 AM"
weight<-weight %>%
separate(Date,c("day","time","meridiem"), sep=" ")
#reassign the data frame so the df is not printed in the console
#but updates the data frame
View(weight)
#if you want to do the opposite of separate you can
#use the paste function, may need to use mutate as well
weight$meridiem<-as.factor(weight$meridiem)
Plots for weight and BMI
ggplot(weight,aes(WeightKg,BMI))+
geom_point(aes(color = Id))+
facet_wrap(~meridiem)+
xlim(c(0,140))+
ylim(c(0,50))

#plotting for day should work even if it's a string
ggplot(weight,aes(day,BMI))+
geom_point(aes(color = Id))+
facet_wrap(~meridiem)+
theme(axis.text.x = element_text(angle = 70, vjust = 0.5))

#coincidentially the dates are ordered in time as strings!
#colors are not super clear, make a plotly
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
bmi<-ggplot(weight,aes(day,BMI))+
geom_point(aes(color = Id, shape = meridiem))+
theme(axis.text.x = element_text(angle = 70, vjust = 0.5))
bmi

ggplotly(bmi)#not working as facet_wrap ~meridiem,
#I changed the shape to meridiem then..
#plotly is not rendering. I will reinstall plotly
install.packages("plotly")
## Warning: package 'plotly' is in use and will not be installed
#maybe dual axis?
ggplot(weight,aes(day,WeightKg))+
geom_point(aes(color = Id, shape = meridiem))+
theme(axis.text.x = element_text(angle = 70, vjust = 0.5))

#mean weights and bmi
weight %>%
group_by(Id) %>%
summarize(avg_weight = mean(WeightKg),avg_bmi = mean(BMI)) %>%
arrange(desc(avg_weight))
## # A tibble: 11 × 3
## Id avg_weight avg_bmi
## <fct> <dbl> <dbl>
## 1 1927972279 130. 46.2
## 2 4702921684 99.7 26.1
## 3 4445114986 92.4 35.0
## 4 2891001357 88.4 25.0
## 5 8877689391 85.8 25.7
## 6 8253242879 75.6 29.5
## 7 4558609924 69.4 27.1
## 8 2347167796 63.4 24.8
## 9 6962181067 61.9 24.2
## 10 2873212765 57.0 21.6
## 11 1503960366 53.3 23.0
#DATA IS INCOMPLETE!
Daily Activity data
daily_activity<-read.csv("C:\\Users\\deyvi\\OneDrive\\Desktop\\R files\\mturkfitbit_export_3.12.16-4.11.16\\Fitabase Data 3.12.16-4.11.16\\dailyActivity_merged.csv")
class(daily_activity$Id)
## [1] "numeric"
daily_activity$Id<-as.factor(daily_activity$Id)
#group_by numeric will not work, convert Id to string or factor
activity<-daily_activity %>%
group_by(Id) %>%
summarize(avg_VAM = sum(VeryActiveMinutes),
avg_FAM = sum(FairlyActiveMinutes),
avg_LAM = sum(LightlyActiveMinutes))
plot_ly(activity,type = "scatter3d", mode="markers",
x = ~avg_VAM, y = ~avg_FAM, z = ~avg_LAM,
color = ~Id, colors = c("pink","green3","cornflowerblue"))
ggplot(activity,aes(avg_FAM,avg_LAM))+
geom_point(aes(color=Id))

ggplot(activity,aes(avg_VAM,avg_FAM))+
geom_point(aes(color=Id))+
geom_smooth(se=F)+
geom_smooth(method="lm",color="cyan",se=F)+
geom_smooth(method="gam",color="yellow",se=F)+
stat_smooth(method='lm', formula = y ~ poly(x,3),
color="green")#interesting
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'

ggplot(activity,aes(avg_LAM,avg_VAM))+
geom_point(aes(color=Id))

#ref https://www.statology.org/polynomial-regression-r/
#for now a distinct approach
pred1<-predict(lm(activity$avg_FAM~poly(activity$avg_VAM,3)),
data=activity[activity$avg_VAM < 300,])
#error: https://www.statology.org/error-in-evalpredvars-data-env-object-not-found/
#work around:
activity_mod<-activity %>%
filter(avg_VAM < 300)
prediction1<-predict(lm(avg_FAM ~ poly(avg_VAM,3),data = activity_mod))
ggplot(activity,aes(avg_VAM,avg_FAM))+
geom_point(aes(color=Id))+
geom_line(data=activity_mod,aes(x=avg_VAM,y=prediction1),
color="blue")

#okay now add the second regression prediction
activity_mod2<-activity %>%
filter(avg_VAM > 300)
prediction2<-predict(lm(avg_FAM ~ avg_VAM,data = activity_mod2))
ggplot(activity,aes(avg_VAM,avg_FAM))+
geom_point(aes(color=Id))+
geom_line(data=activity_mod,aes(x=avg_VAM,y=prediction1),
color="blue",lwd=1)+
geom_line(data = activity_mod2,aes(x=avg_VAM,y=prediction2),
color="cornflowerblue",lwd=1)+
annotate("text",x=250,y=600,
label="lm(avg_FAM~poly(avg_VAM,3) for <300",
color="blue",fontface="bold")+
annotate("text",x=250,y=550,
label="lm(avg_FAM~avg_VAM) for >300",
color="cornflowerblue",fontface="bold")

#note: data point for 2891001357 has unusually high FAM at 0 VAM
#might be an outlier
Sleep Minutes Data
min_sleep<-read_csv("C:\\Users\\deyvi\\OneDrive\\Desktop\\R files\\mturkfitbit_export_3.12.16-4.11.16\\Fitabase Data 3.12.16-4.11.16\\minuteSleep_merged.csv")
## Rows: 198559 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): date
## dbl (3): Id, value, logId
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(min_sleep)
## # A tibble: 6 × 4
## Id date value logId
## <dbl> <chr> <dbl> <dbl>
## 1 1503960366 3/13/2016 2:39:30 AM 1 11114919637
## 2 1503960366 3/13/2016 2:40:30 AM 1 11114919637
## 3 1503960366 3/13/2016 2:41:30 AM 1 11114919637
## 4 1503960366 3/13/2016 2:42:30 AM 1 11114919637
## 5 1503960366 3/13/2016 2:43:30 AM 1 11114919637
## 6 1503960366 3/13/2016 2:44:30 AM 1 11114919637
#value 1,2,3 mean 1 is asleep, 2 is restless, 3 is awake
n_distinct(min_sleep$Id) #23 Ids
## [1] 23
min_sleep %>%
group_by(Id) %>%
summarize(avg_slpstate = mean(value)) %>%
arrange(desc(avg_slpstate)) # top 5 people with least sleep
## # A tibble: 23 × 2
## Id avg_slpstate
## <dbl> <dbl>
## 1 3977333714 1.34
## 2 1844505072 1.32
## 3 1644430081 1.31
## 4 5553957443 1.11
## 5 2347167796 1.11
## 6 6775888955 1.09
## 7 1503960366 1.09
## 8 8378563200 1.08
## 9 4445114986 1.08
## 10 2026352035 1.08
## # ℹ 13 more rows
# 3977333714 1.34
# 1844505072 1.32
# 1644430081 1.31
# 5553957443 1.11
# 2347167796 1.11
min_sleep %>%
group_by(Id) %>%
summarize(avg_slpstate = mean(value)) %>%
arrange(avg_slpstate) #top 5 Ids with MOST sleep
## # A tibble: 23 × 2
## Id avg_slpstate
## <dbl> <dbl>
## 1 8053475328 1.01
## 2 4558609924 1.02
## 3 7086361926 1.04
## 4 6962181067 1.05
## 5 1927972279 1.05
## 6 4020332650 1.06
## 7 4319703577 1.06
## 8 2022484408 1.06
## 9 5577150313 1.06
## 10 6117666160 1.06
## # ℹ 13 more rows
# 8053475328 1.01
# 4558609924 1.02
# 7086361926 1.04
# 6962181067 1.05
# 1927972279 1.05
class(min_sleep$date)
## [1] "character"
min_sleep<-min_sleep %>%
separate(date, c("day", "time", "meridiem"),sep= " ")
min_sleep<-min_sleep %>%
separate(day, c("month","day", "year", sep="/"))
## Warning: Expected 4 pieces. Missing pieces filled with `NA` in 198559 rows [1, 2, 3, 4,
## 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
class(min_sleep$month)
## [1] "character"
min_sleep$month<-as.factor(min_sleep$month)
levels(min_sleep$month)#only two months okay..
## [1] "3" "4"
# 3 and 4 numericals (factors)
min_sleep$month<-as.character(min_sleep$month)
min_sleep$month[min_sleep$month == "3"]<-"03"
min_sleep$month[min_sleep$month == "4"]<-"04"
min_sleep<-min_sleep %>%
mutate(ymd = paste(min_sleep$year,
min_sleep$month,min_sleep$day, sep = "-"))
min_sleep$ymd<-as.Date(min_sleep$ymd)
class(min_sleep$ymd)
## [1] "Date"
class(min_sleep$value)
## [1] "numeric"
class(min_sleep$Id)
## [1] "numeric"
ggplot(min_sleep, aes(ymd,value))+
geom_point(aes(color = as.factor(Id),
shape = as.factor(value)))+
facet_wrap(~Id)

#okay i think the issue of overlap is that I did not include time
#fix it
class(min_sleep$time)
## [1] "character"
min_sleep<-min_sleep %>%
separate(time, c("hours", "minutes", "seconds", sep=":"))
## Warning: Expected 4 pieces. Missing pieces filled with `NA` in 198559 rows [1, 2, 3, 4,
## 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
min_sleep<-min_sleep %>%
select(-`:`)
min_sleep$hours<-as.factor(min_sleep$hours)
levels(min_sleep$hours)
## [1] "1" "10" "11" "12" "2" "3" "4" "5" "6" "7" "8" "9"
min_sleep$hours<-as.character(min_sleep$hours)
min_sleep$hours[min_sleep$hours == "1"]<-"01"
min_sleep$hours[min_sleep$hours == "2"]<-"02"
min_sleep$hours[min_sleep$hours == "3"]<-"03"
min_sleep$hours[min_sleep$hours == "4"]<-"04"
min_sleep$hours[min_sleep$hours == "5"]<-"05"
min_sleep$hours[min_sleep$hours == "6"]<-"06"
min_sleep$hours[min_sleep$hours == "7"]<-"07"
min_sleep$hours[min_sleep$hours == "8"]<-"08"
min_sleep$hours[min_sleep$hours == "9"]<-"09"
min_sleep<-min_sleep %>%
mutate(date_time = paste(min_sleep$ymd, min_sleep$time, sep = " "))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `date_time = paste(min_sleep$ymd, min_sleep$time, sep = " ")`.
## Caused by warning:
## ! Unknown or uninitialised column: `time`.
min_sleep<-min_sleep %>%
mutate(hms = paste(min_sleep$hours,min_sleep$minutes,
min_sleep$seconds, sep = ":"))
min_sleep<-min_sleep %>%
mutate(yr_m_d_hms = paste(min_sleep$ymd,min_sleep$hms, sep=" "))
min_sleep$yr_m_d_hms<-ymd_hms(min_sleep$yr_m_d_hms)
class(min_sleep$yr_m_d_hms)
## [1] "POSIXct" "POSIXt"
#finally, now plot again
#convert Id to factor
min_sleep$Id<-as.factor(min_sleep$Id)
ggplot(min_sleep, aes(yr_m_d_hms,value))+
geom_point(aes(color = Id,
shape = as.character(value)))+
facet_wrap(~Id)

#plotting the most complete one
min_sleep %>%
filter(Id == "8378563200") %>%
ggplot(aes(yr_m_d_hms,value))+
geom_point(aes(color=value))+
theme_bw()

#take away most are not using their fitbit for sleep tracking
#completely or fitbit is not working properly
Daily Steps Data
hr_steps<-read.csv("C:\\Users\\deyvi\\OneDrive\\Desktop\\R files\\mturkfitbit_export_3.12.16-4.11.16\\Fitabase Data 3.12.16-4.11.16\\hourlySteps_merged.csv")
n_distinct(hr_steps$Id) #34, but only 30 Ids at max are expected
## [1] 34
#clean date-time
#do a geom_step plot
hr_steps<-hr_steps %>%
separate(ActivityHour, c("day", "time", "meridiem"),sep= " ")
hr_steps<-hr_steps %>%
separate(day, c("month","day", "year", sep="/"))
## Warning: Expected 4 pieces. Missing pieces filled with `NA` in 24084 rows [1, 2, 3, 4,
## 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
class(hr_steps$month)
## [1] "character"
hr_steps$month<-as.factor(hr_steps$month)
levels(hr_steps$month)#only two months okay..
## [1] "3" "4"
# 3 and 4 numericals (factors)
hr_steps$month<-as.character(hr_steps$month)
hr_steps$month[hr_steps$month == "3"]<-"03"
hr_steps$month[hr_steps$month == "4"]<-"04"
hr_steps<-hr_steps %>%
select(-`/`)
hr_steps<-hr_steps %>%
mutate(ymd = paste(hr_steps$year,
hr_steps$month,hr_steps$day, sep = "-"))
hr_steps$ymd<-as.Date(hr_steps$ymd)
class(hr_steps$ymd)
## [1] "Date"
class(hr_steps$time)
## [1] "character"
hr_steps<-hr_steps %>%
separate(time, c("hours", "minutes", "seconds", sep=":"))
## Warning: Expected 4 pieces. Missing pieces filled with `NA` in 24084 rows [1, 2, 3, 4,
## 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
hr_steps<-hr_steps %>%
select(-`:`)
hr_steps$hours<-as.factor(hr_steps$hours)
levels(hr_steps$hours)
## [1] "1" "10" "11" "12" "2" "3" "4" "5" "6" "7" "8" "9"
hr_steps$hours<-as.character(hr_steps$hours)
hr_steps$hours[hr_steps$hours == "1"]<-"01"
hr_steps$hours[hr_steps$hours == "2"]<-"02"
hr_steps$hours[hr_steps$hours == "3"]<-"03"
hr_steps$hours[hr_steps$hours == "4"]<-"04"
hr_steps$hours[hr_steps$hours == "5"]<-"05"
hr_steps$hours[hr_steps$hours == "6"]<-"06"
hr_steps$hours[hr_steps$hours == "7"]<-"07"
hr_steps$hours[hr_steps$hours == "8"]<-"08"
hr_steps$hours[hr_steps$hours == "9"]<-"09"
hr_steps<-hr_steps %>%
mutate(hms = paste(hr_steps$hours,hr_steps$minutes,
hr_steps$seconds, sep = ":"))
hr_steps<-hr_steps %>%
mutate(yr_m_d_hms = paste(hr_steps$ymd,hr_steps$hms, sep=" "))
hr_steps$yr_m_d_hms<-ymd_hms(hr_steps$yr_m_d_hms)
class(hr_steps$yr_m_d_hms)
## [1] "POSIXct" "POSIXt"
hr_steps$Id<-as.factor(hr_steps$Id)
ggplot(hr_steps, aes(yr_m_d_hms,StepTotal))+
geom_step(aes(color=Id))+
theme_classic()+
facet_wrap(~Id)

#8877689391 seems to be the most active
hr_steps %>%
filter(Id == "8583815059") %>%
ggplot(aes(yr_m_d_hms,StepTotal))+
geom_step(color="pink")+
theme_bw()+
ggtitle("Id = 8583815059")

hr_steps %>%
filter(Id == "8583815059") %>%
filter(month == "04") %>%
ggplot(aes(yr_m_d_hms,StepTotal))+
geom_step(color="pink3")+
theme_bw()+
ggtitle("Id = 8583815059 April only")

hr_steps %>%
filter(Id == "8583815059") %>%
filter(month == "04") %>%
ggplot(aes(yr_m_d_hms,StepTotal))+
geom_point(color="salmon")+
theme_bw()+
ggtitle("Id = 8583815059 April only")

#okay from the scatter we know this person
#doesn't use the fitbit after mid-day
Session Info
sessionInfo()
## R version 4.4.0 (2024-04-24 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 10 x64 (build 19045)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] plotly_4.10.4 lubridate_1.9.3 forcats_1.0.0 stringr_1.5.1
## [5] dplyr_1.1.4 purrr_1.0.2 readr_2.1.5 tidyr_1.3.1
## [9] tibble_3.2.1 ggplot2_3.5.1 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] sass_0.4.9 utf8_1.2.4 generics_0.1.3 lattice_0.22-6
## [5] stringi_1.8.4 hms_1.1.3 digest_0.6.35 magrittr_2.0.3
## [9] evaluate_0.23 grid_4.4.0 timechange_0.3.0 fastmap_1.1.1
## [13] Matrix_1.7-0 jsonlite_1.8.8 mgcv_1.9-1 httr_1.4.7
## [17] fansi_1.0.6 crosstalk_1.2.1 viridisLite_0.4.2 scales_1.3.0
## [21] lazyeval_0.2.2 jquerylib_0.1.4 cli_3.6.2 crayon_1.5.2
## [25] rlang_1.1.3 bit64_4.0.5 splines_4.4.0 munsell_0.5.1
## [29] withr_3.0.0 cachem_1.0.8 yaml_2.3.8 parallel_4.4.0
## [33] tools_4.4.0 tzdb_0.4.0 colorspace_2.1-0 vctrs_0.6.5
## [37] R6_2.5.1 lifecycle_1.0.4 bit_4.0.5 htmlwidgets_1.6.4
## [41] vroom_1.6.5 pkgconfig_2.0.3 pillar_1.9.0 bslib_0.7.0
## [45] gtable_0.3.5 glue_1.7.0 data.table_1.15.4 xfun_0.43
## [49] tidyselect_1.2.1 highr_0.10 rstudioapi_0.16.0 knitr_1.46
## [53] farver_2.1.2 nlme_3.1-164 htmltools_0.5.8.1 rmarkdown_2.26
## [57] labeling_0.4.3 compiler_4.4.0