About the Project and Dataset

This is a detailed analysis of spreadsheet data from a fitness smartwatch that tracks calories, hours slept, intensity levels, and overall activity. The original dataset can be found on Kaggle; a modified and cleaned version is available on my profile at Fitbit Project Portfolio.

In this analysis, I clean the data and aggregate the datetime columns from the various spreadsheets to ensure that each of the CSV files (dailyActivity, dailySleep, dailyCalories, dailyIntensities, dailySteps) has a uniform granularity of one observation per day. Additionally, I create a column that combines the activity day and Id to form a unique identifier (there are multiple observations per Id - see hourlyIntensities). After cleaning the data, I use R code to merge and join the CSV files to begin looking for trends and patterns (joining could have been done in SQL, but the number of observations is relatively sparse).

library(ggplot2)
library(ggridges)
library(tidyr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(skimr)
library(dplyr)
library(viridis)
## Loading required package: viridisLite
library(SmartEDA)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(dlookr)
## Registered S3 methods overwritten by 'dlookr':
##   method          from  
##   plot.transform  scales
##   print.transform scales
## 
## Attaching package: 'dlookr'
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## The following object is masked from 'package:base':
## 
##     transform
library(DataExplorer)
library(moments)
## 
## Attaching package: 'moments'
## 
## The following objects are masked from 'package:dlookr':
## 
##     kurtosis, skewness
library(flextable)
## 
## Attaching package: 'flextable'
## 
## The following object is masked from 'package:purrr':
## 
##     compose
library(forcats)
library(corrplot)
## corrplot 0.92 loaded
# Turn off warning output for the remainder of the session.
# NOTE(review): this is global, so it also hides any warnings raised by the
# CSV reads and joins further down (e.g. parsing problems or unexpected join
# fan-out) -- confirm that silencing everything is intended.
options(warn = -1)

# -- Load and Clean Data -- #
# Directory holding the cleaned Fitabase CSV exports. It defaults to the
# location used when this analysis was written; set the FITBIT_DATA_DIR
# environment variable to run the script from another machine without editing
# the code.
data_dir <- Sys.getenv(
  "FITBIT_DATA_DIR",
  unset = "C:/Users/Anthony Morciglio/OneDrive/Coursera Google Data Analytics/Fitabase Data 3.12.16-4.11.16"
)

# One path per daily table. file.path() joins with "/", so with the default
# directory these are exactly the previously hard-coded strings.
f1 <- file.path(data_dir, "dailyActivity_merged.csv")
f2 <- file.path(data_dir, "dailySleep_merged.csv")
f3 <- file.path(data_dir, "dailyCalories_merged.csv")
f4 <- file.path(data_dir, "dailyIntensities_merged.csv")
f5 <- file.path(data_dir, "dailySteps_merged.csv")


# Daily activity table (step, distance and related columns), keyed by ID-AD.
df_daily_act <- readr::read_csv(file = f1)
## Rows: 457 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): ActivityDate
## dbl (15): ID-AD, Id, TotalSteps, TotalDistance, TrackerDistance, LoggedActiv...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Sleep table: ID-AD, Id, logId, date and hours, as reported by the column
# specification printed below.
df_daily_sleep <- readr::read_csv(file = f2)
## Rows: 198559 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): date
## dbl (4): ID-AD, Id, logId, hours
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Daily calories table.
# NOTE(review): readr parses ID-AD as character in this file but as double in
# the other four tables, so joining this table on ID-AD would need the key
# types reconciled first.
df_daily_cal <- readr::read_csv(file = f3)
## Rows: 1021 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): ID-AD, date
## dbl (2): Id, Calories
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Daily intensity table: ID-AD, Id, date, TotalIntensity and AverageIntensity.
df_daily_int <- readr::read_csv(file = f4)
## Rows: 1021 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): date
## dbl (4): ID-AD, Id, TotalIntensity, AverageIntensity
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Daily step totals: ID-AD, Id, date and StepTotal.
df_daily_steps <- readr::read_csv(file = f5)
## Rows: 1021 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): date
## dbl (3): ID-AD, Id, StepTotal
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# -- Combine data using join or merge functions -- #
# Remark: joining on the unique identifier, ID-AD, obtained by combining Id
# with date.
#
# NOTE(review): the joined table repeats a daily-activity row once per matching
# sleep record (rows that differ only in HoursSlept), so activity values are
# counted several times for those days. Confirm whether the sleep table should
# be collapsed to one row per ID-AD before joining.

df_tot <- df_daily_act %>%
  dplyr::inner_join(df_daily_sleep, by = "ID-AD") %>%
  # Id exists in both inputs, so the join suffixes it; drop both copies.
  dplyr::select(-Id.x, -Id.y) %>%
  dplyr::inner_join(df_daily_int, by = "ID-AD") %>%
  # `date` arrives from both the sleep and the intensity tables; the activity
  # table already carries ActivityDate, so neither copy is kept.
  dplyr::select(-date.x, -date.y, -TrackerDistance) %>%
  # Rename by column name instead of by position, so a change in the column
  # layout of any input fails loudly rather than mislabelling a column.
  # Assumes the activity CSV still uses the Kaggle export names
  # FairlyActiveMinutes / LightActiveDistance -- TODO confirm.
  dplyr::rename(
    HoursSlept = hours,
    ModeratelyActiveMinutes = FairlyActiveMinutes,
    LightlyActiveDistance = LightActiveDistance
  )

Observations after Modification

After joining the data using dplyr::inner_join(), we change a few of the column names and observe most of the columns are numeric types. Before creating visualizations, we will generate a few preliminary reports using functions from libraries such as DataExplorer, SmartEDA, and dlookr.

It would be beneficial to create categorical variables based on the activity levels of the people using the fitbits. Since we are interested in the number of Calories burned (to market the product in the wellness and fitness industry), we can categorize activity levels hierarchically such as Sedentary, Lightly Active, Moderately Active, and Very Active using binning.

## Rows: 228
## Columns: 16
## $ ActivityDate             <chr> "3/25/2016", "3/26/2016", "3/26/2016", "3/27/…
## $ TotalSteps               <dbl> 11004, 17609, 17609, 12736, 12736, 13231, 120…
## $ TotalDistance            <dbl> 7.11, 11.55, 11.55, 8.53, 8.53, 8.93, 7.85, 7…
## $ LoggedActivitiesDistance <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ VeryActiveDistance       <dbl> 2.57, 6.92, 6.92, 4.66, 4.66, 3.19, 2.16, 2.2…
## $ ModeratelyActiveDistance <dbl> 0.46, 0.73, 0.73, 0.16, 0.16, 0.79, 1.09, 0.4…
## $ LightlyActiveDistance    <dbl> 4.07, 3.91, 3.91, 3.71, 3.71, 4.95, 4.61, 5.0…
## $ SedentaryActiveDistance  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ VeryActiveMinutes        <dbl> 33, 89, 89, 56, 56, 39, 28, 33, 47, 40, 15, 4…
## $ ModeratelyActiveMinutes  <dbl> 12, 17, 17, 5, 5, 20, 28, 12, 21, 11, 30, 18,…
## $ LightlyActiveMinutes     <dbl> 205, 274, 274, 268, 268, 224, 243, 239, 200, …
## $ SedentaryMinutes         <dbl> 804, 588, 588, 605, 605, 1080, 763, 820, 866,…
## $ Calories                 <dbl> 1819, 2154, 2154, 1944, 1944, 1932, 1886, 188…
## $ HoursSlept               <dbl> 6.42, 7.60, 0.23, 2.13, 5.48, 1.27, 6.28, 5.5…
## $ TotalIntensity           <dbl> 139, 130, 130, 97, 97, 76, 56, 61, 52, 129, 9…
## $ AverageIntensity         <dbl> 0.070, 0.066, 0.066, 0.049, 0.049, 0.038, 0.0…

ActivityDate

TotalSteps

TotalDistance

LoggedActivitiesDistance

VeryActiveDistance

ModeratelyActiveDistance

LightlyActiveDistance

SedentaryActiveDistance

VeryActiveMinutes

ModeratelyActiveMinutes

LightlyActiveMinutes

SedentaryMinutes

Calories

HoursSlept

TotalIntensity

AverageIntensity

3/25/2016

11,004

7.11

0.000000

2.57

0.46

4.07

0.00

33

12

205

804

1,819

6.42

139

0.070

3/26/2016

17,609

11.55

0.000000

6.92

0.73

3.91

0.00

89

17

274

588

2,154

7.60

130

0.066

3/26/2016

17,609

11.55

0.000000

6.92

0.73

3.91

0.00

89

17

274

588

2,154

0.23

130

0.066

3/27/2016

12,736

8.53

0.000000

4.66

0.16

3.71

0.00

56

5

268

605

1,944

2.13

97

0.049

3/27/2016

12,736

8.53

0.000000

4.66

0.16

3.71

0.00

56

5

268

605

1,944

5.48

97

0.049

3/28/2016

13,231

8.93

0.000000

3.19

0.79

4.95

0.00

39

20

224

1,080

1,932

1.27

76

0.038

3/29/2016

12,041

7.85

0.000000

2.16

1.09

4.61

0.00

28

28

243

763

1,886

6.28

56

0.028

3/31/2016

12,256

7.86

0.000000

2.29

0.49

5.04

0.00

33

12

239

820

1,889

5.58

61

0.031

4/1/2016

12,262

7.87

0.000000

3.32

0.83

3.64

0.00

47

21

200

866

1,868

5.08

52

0.026

4/2/2016

11,248

7.25

0.000000

3.00

0.45

3.74

0.00

40

11

244

636

1,843

8.47

129

0.065

4/3/2016

10,016

6.37

0.000000

0.91

1.28

4.18

0.00

15

30

314

655

1,850

7.08

96

0.050

4/4/2016

14,557

9.80

0.000000

3.39

0.70

5.69

0.00

43

18

285

757

2,030

5.60

69

0.036

4/5/2016

14,844

9.73

0.000000

2.94

0.76

6.04

0.00

36

18

341

736

2,083

5.13

46

0.023

4/9/2016

12,432

8.10

0.000000

2.59

0.59

4.92

0.00

32

15

248

738

1,883

6.77

45

0.025

4/10/2016

10,057

6.98

0.000000

4.00

0.49

2.48

0.00

44

13

168

737

1,755

7.95

120

0.074

4/11/2016

10,990

7.26

0.000000

2.04

0.57

4.65

0.00

26

14

216

855

1,811

5.47

67

0.045

4/3/2016

2,841

1.88

0.000000

0.00

0.00

1.88

0.00

0

0

136

1,114

1,636

3.15

96

0.050

4/9/2016

4,979

3.29

0.000000

0.00

0.00

3.29

0.00

0

0

184

620

1,807

10.58

45

0.025

4/2/2016

5,662

3.92

0.000000

0.00

0.00

3.92

0.00

0

0

267

858

2,783

5.23

129

0.065

4/3/2016

3,198

2.21

0.000000

0.00

0.00

2.21

0.00

0

0

146

1,183

2,449

1.83

96

0.050

4/4/2016

2,352

1.63

0.000000

0.00

0.00

1.63

0.00

0

0

128

829

2,380

3.13

69

0.036

4/5/2016

2,234

1.55

0.000000

0.00

0.00

1.55

0.00

0

0

108

912

2,344

6.98

46

0.023

4/6/2016

1,259

0.87

0.000000

0.00

0.00

0.87

0.00

0

0

73

938

2,202

7.13

122

0.064

4/9/2016

2,523

1.75

0.000000

0.00

0.37

1.38

0.00

0

9

134

1,250

2,443

0.77

45

0.025

4/10/2016

2,105

1.46

0.000000

0.00

0.00

1.46

0.00

0

0

139

805

2,442

5.13

120

0.074

4/10/2016

2,105

1.46

0.000000

0.00

0.00

1.46

0.00

0

0

139

805

2,442

1.03

120

0.074

4/11/2016

1,209

0.84

0.000000

0.00

0.00

0.84

0.00

0

0

73

842

2,255

2.05

67

0.045

4/12/2016

24

0.02

0.000000

0.00

0.00

0.02

0.00

0

0

3

161

942

2.13

25

0.021

4/1/2016

2,605

1.62

0.000000

0.00

0.00

1.62

0.00

0

0

166

729

1,407

2.45

52

0.026

4/2/2016

1,229

0.76

0.000000

0.00

0.00

0.76

0.00

0

0

69

795

1,237

2.55

129

0.065

4/3/2016

2,308

1.43

0.000000

0.00

0.00

1.43

0.00

0

0

115

720

1,330

2.97

96

0.050

4/4/2016

6,679

4.14

0.000000

0.00

0.00

4.14

0.00

0

0

263

644

1,583

2.00

69

0.036

4/5/2016

5,645

3.50

0.000000

0.00

0.00

3.50

0.00

0

0

248

621

1,538

2.57

46

0.023

4/6/2016

3,180

1.97

0.000000

0.00

0.00

1.97

0.00

0

0

185

738

1,421

2.17

122

0.064

4/7/2016

2,767

1.72

0.000000

0.00

0.00

1.72

0.00

0

0

170

673

1,392

2.95

80

0.042

4/8/2016

3,762

2.33

0.000000

0.00

0.00

2.33

0.00

0

0

173

719

1,425

2.92

45

0.023

4/9/2016

3,098

1.92

0.000000

0.00

0.00

1.92

0.00

0

0

185

700

1,413

2.82

45

0.025

4/10/2016

5,142

3.19

0.000000

0.00

0.00

3.19

0.00

0

0

230

654

1,515

2.72

120

0.074

4/11/2016

3,279

2.03

0.000000

0.00

0.00

2.03

0.00

0

0

164

694

1,405

3.18

67

0.045

3/29/2016

10,272

6.79

0.000000

0.16

3.12

3.50

0.00

2

58

208

700

2,041

1.82

56

0.028

3/30/2016

10,533

7.10

0.000000

1.77

2.06

3.27

0.00

21

35

255

615

2,187

1.57

99

0.050

3/31/2016

6,760

4.47

0.000000

0.00

0.00

4.47

0.00

0

0

250

613

1,929

2.28

61

0.031

4/2/2016

15,459

10.22

0.000000

3.59

0.81

5.82

0.00

51

16

327

583

2,438

7.63

129

0.065

4/2/2016

15,459

10.22

0.000000

3.59

0.81

5.82

0.00

51

16

327

583

2,438

0.05

129

0.065

4/3/2016

7,485

4.95

0.000000

0.00

0.00

4.95

0.00

0

0

324

491

2,035

2.50

96

0.050

4/4/2016

10,254

6.80

0.000000

1.42

1.23

4.16

0.00

21

25

231

638

2,099

2.08

69

0.036

4/5/2016

10,114

6.82

0.000000

1.64

0.48

4.68

0.00

18

9

269

696

2,096

0.62

46

0.023

4/7/2016

10,320

6.85

0.000000

0.68

1.23

4.94

0.00

15

32

315

1,002

2,338

1.25

80

0.042

4/9/2016

16,081

10.63

0.000000

1.25

1.82

7.56

0.00

16

32

401

970

2,488

0.33

45

0.025

4/10/2016

10,078

6.83

0.000000

1.02

0.12

5.69

0.00

12

3

303

463

2,164

1.93

120

0.074

4/1/2016

4,499

3.01

0.000000

0.00

0.00

3.01

0.00

0

0

168

842

1,288

7.15

52

0.026

4/2/2016

7,618

5.10

0.000000

0.00

0.18

4.91

0.00

0

6

302

842

1,490

4.82

129

0.065

4/3/2016

11,508

7.70

0.000000

2.17

0.84

4.69

0.00

29

22

282

748

1,630

5.97

96

0.050

4/4/2016

11,943

7.99

0.000000

1.03

2.01

4.95

0.00

16

43

297

710

1,648

6.22

69

0.036

4/5/2016

12,303

8.23

0.000000

1.71

1.35

5.17

0.00

26

32

291

605

1,649

8.08

46

0.023

4/6/2016

15,425

10.32

0.000000

1.62

1.82

6.87

0.00

24

45

331

614

1,783

7.08

122

0.064

4/7/2016

8,422

5.63

0.000000

0.00

3.60

2.04

0.00

0

81

139

815

1,431

6.73

80

0.042

4/8/2016

10,226

6.84

0.000000

0.00

4.44

2.40

0.00

0

101

174

685

1,524

7.98

45

0.023

4/9/2016

14,583

9.76

0.000000

2.48

1.74

5.53

0.00

35

36

254

697

1,697

6.95

45

0.025

4/10/2016

3,573

2.39

0.000000

0.94

0.41

0.97

0.00

15

8

99

866

1,232

7.52

120

0.074

4/11/2016

3,108

2.14

0.000000

0.07

0.04

1.96

0.00

1

1

129

894

1,223

6.90

67

0.045

3/12/2016

5,543

3.97

0.000000

0.00

0.00

3.96

0.01

0

0

254

757

2,990

6.00

98

0.049

3/12/2016

5,543

3.97

0.000000

0.00

0.00

3.96

0.01

0

0

254

757

2,990

1.12

98

0.049

3/13/2016

3,226

2.31

0.000000

0.00

0.00

2.28

0.00

0

0

136

771

2,480

5.37

53

0.027

3/14/2016

3,023

2.17

0.000000

0.00

0.00

2.14

0.00

0

0

145

1,005

2,570

4.82

27

0.014

3/15/2016

5,906

4.23

0.000000

0.00

0.16

4.04

0.00

0

10

215

874

3,016

5.67

53

0.027

3/16/2016

12,483

8.99

0.000000

1.45

0.57

6.90

0.00

25

14

309

599

3,830

8.20

73

0.037

3/17/2016

8,940

6.41

0.000000

0.00

0.00

0.61

0.04

0

0

47

986

3,706

6.77

70

0.035

3/30/2016

5,400

3.87

0.000000

0.00

0.00

3.85

0.00

0

0

258

795

3,418

2.43

99

0.050

3/30/2016

5,400

3.87

0.000000

0.00

0.00

3.85

0.00

0

0

258

795

3,418

3.98

99

0.050

3/31/2016

7,428

5.33

0.000000

0.87

0.92

3.51

0.00

18

33

261

860

3,439

4.45

61

0.031

4/1/2016

5,351

3.84

0.000000

0.62

0.18

3.01

0.00

15

17

210

748

3,338

4.08

52

0.026

4/1/2016

5,351

3.84

0.000000

0.62

0.18

3.01

0.00

15

17

210

748

3,338

2.92

52

0.026

4/1/2016

5,351

3.84

0.000000

0.62

0.18

3.01

0.00

15

17

210

748

3,338

0.45

52

0.026

4/2/2016

4,299

3.10

0.000000

0.09

0.21

2.74

0.04

3

15

162

752

2,892

3.12

129

0.065

4/3/2016

6,107

4.38

0.000000

0.00

0.00

4.36

0.00

0

0

265

776

3,313

5.00

96

0.050

4/3/2016

6,107

4.38

0.000000

0.00

0.00

4.36

0.00

0

0

265

776

3,313

1.62

96

0.050

4/4/2016

6,429

4.60

0.000000

0.00

0.95

3.65

0.00

0

28

193

741

3,118

7.95

69

0.036

4/6/2016

7,476

5.36

0.000000

0.00

0.00

5.31

0.00

0

0

263

920

3,253

4.27

122

0.064

4/10/2016

5,129

3.68

0.000000

0.18

0.24

3.21

0.01

5

7

176

1,178

2,817

1.22

120

0.074

4/11/2016

2,993

2.15

0.000000

0.00

0.00

2.09

0.00

0

0

114

888

2,507

3.75

67

0.045

3/29/2016

5,643

3.79

0.000000

0.23

0.33

3.23

0.00

3

8

199

683

1,958

1.60

56

0.028

3/29/2016

5,643

3.79

0.000000

0.23

0.33

3.23

0.00

3

8

199

683

1,958

1.73

56

0.028

3/31/2016

8,144

5.46

0.000000

0.25

0.61

4.61

0.00

4

17

247

1,125

2,129

0.77

61

0.031

4/1/2016

9,343

6.27

0.000000

0.96

0.85

4.46

0.00

14

20

268

654

2,216

1.55

52

0.026

4/1/2016

9,343

6.27

0.000000

0.96

0.85

4.46

0.00

14

20

268

654

2,216

0.72

52

0.026

4/2/2016

8,405

5.66

0.000000

1.13

0.19

4.34

0.00

15

4

257

621

2,154

0.97

129

0.065

4/3/2016

8,223

5.52

0.000000

0.80

1.51

3.21

0.00

12

37

257

551

2,178

1.03

96

0.050

4/4/2016

10,067

6.76

0.000000

0.36

1.42

4.97

0.00

5

39

313

578

2,308

1.52

69

0.036

4/5/2016

8,359

5.63

0.000000

0.23

0.53

4.87

0.00

3

14

311

638

2,201

1.15

46

0.023

4/6/2016

10,946

7.35

0.000000

0.54

2.09

4.72

0.00

8

51

308

569

2,341

4.30

122

0.064

4/6/2016

10,946

7.35

0.000000

0.54

2.09

4.72

0.00

8

51

308

569

2,341

0.80

122

0.064

3/29/2016

2,303

1.55

0.000000

0.00

0.00

1.55

0.00

0

0

155

807

2,010

3.75

56

0.028

3/29/2016

2,303

1.55

0.000000

0.00

0.00

1.55

0.00

0

0

155

807

2,010

4.18

56

0.028

3/31/2016

4,804

3.22

0.000000

0.00

0.00

3.22

0.00

0

0

238

786

2,227

4.90

61

0.031

3/31/2016

4,804

3.22

0.000000

0.00

0.00

3.22

0.00

0

0

238

786

2,227

2.00

61

0.031

4/1/2016

3,271

2.19

0.000000

0.00

0.00

2.19

0.00

0

0

205

898

2,133

5.60

52

0.026

4/2/2016

5,406

3.63

0.000000

0.00

0.00

3.61

0.00

0

0

273

672

2,317

8.23

129

0.065

4/5/2016

4,239

2.84

0.000000

0.06

0.24

2.53

0.00

1

6

235

702

2,217

8.25

46

0.023

4/6/2016

6,911

4.78

0.000000

2.32

0.10

2.36

0.00

28

2

215

745

2,374

7.48

122

0.064

4/7/2016

6,667

4.61

0.000000

2.02

0.05

2.55

0.00

24

1

208

939

2,335

4.45

80

0.042

4/8/2016

5,543

3.72

0.000000

0.00

0.00

3.72

0.00

0

0

266

941

2,303

1.68

45

0.023

4/8/2016

5,543

3.72

0.000000

0.00

0.00

3.72

0.00

0

0

266

941

2,303

2.17

45

0.023

4/9/2016

4,195

2.81

0.000000

0.00

0.00

2.81

0.00

0

0

239

758

2,210

3.38

45

0.025

4/9/2016

4,195

2.81

0.000000

0.00

0.00

2.81

0.00

0

0

239

758

2,210

1.15

45

0.025

4/9/2016

4,195

2.81

0.000000

0.00

0.00

2.81

0.00

0

0

239

758

2,210

2.80

45

0.025

4/10/2016

6,625

4.45

0.000000

0.00

0.00

4.45

0.00

0

0

316

755

2,424

4.87

120

0.074

4/10/2016

6,625

4.45

0.000000

0.00

0.00

4.45

0.00

0

0

316

755

2,424

1.25

120

0.074

4/11/2016

5,986

4.13

0.000000

1.95

0.23

1.95

0.00

23

5

195

789

2,297

4.55

67

0.045

4/11/2016

5,986

4.13

0.000000

1.95

0.23

1.95

0.00

23

5

195

789

2,297

2.55

67

0.045

4/12/2016

278

0.19

0.000000

0.00

0.00

0.19

0.00

0

0

20

253

745

5.52

25

0.021

4/2/2016

3,183

2.10

0.000000

0.00

0.00

2.10

0.00

0

0

173

1,180

1,680

1.43

129

0.065

3/29/2016

6,506

5.28

0.000000

0.33

0.27

4.67

0.00

4

5

241

760

2,886

1.15

56

0.028

3/30/2016

7,583

6.15

0.000000

0.25

0.55

5.35

0.00

3

13

227

742

2,915

1.53

99

0.050

3/31/2016

6,963

5.65

0.000000

0.00

0.00

5.65

0.00

0

0

256

759

2,895

1.00

61

0.031

4/1/2016

7,165

5.81

0.000000

0.00

0.37

5.44

0.00

0

9

248

708

2,923

2.00

52

0.026

4/2/2016

10,328

8.38

0.000000

0.00

1.46

6.92

0.00

0

32

367

475

3,323

2.00

129

0.065

4/3/2016

12,116

9.83

0.000000

0.25

2.85

6.72

0.00

3

57

323

471

3,357

1.95

96

0.050

4/4/2016

7,810

6.36

0.000000

0.48

1.20

4.68

0.00

6

27

216

746

2,931

1.33

69

0.036

4/5/2016

6,670

5.41

0.000000

1.24

0.80

3.37

0.00

15

16

199

770

2,848

1.32

46

0.023

4/6/2016

7,605

6.18

0.000000

0.33

0.21

5.63

0.00

4

4

248

708

2,943

1.75

122

0.064

4/7/2016

6,585

5.34

0.000000

0.00

0.00

5.34

0.00

0

0

210

770

2,822

1.63

80

0.042

4/9/2016

14,002

11.36

0.000000

0.61

2.68

8.06

0.00

8

56

381

407

3,597

7.20

45

0.025

4/9/2016

14,002

11.36

0.000000

0.61

2.68

8.06

0.00

8

56

381

407

3,597

2.57

45

0.025

4/10/2016

11,135

9.03

0.000000

0.08

1.16

7.79

0.00

1

25

331

528

3,224

2.02

120

0.074

4/10/2016

11,135

9.03

0.000000

0.08

1.16

7.79

0.00

1

25

331

528

3,224

0.87

120

0.074

4/11/2016

6,499

5.27

0.000000

0.00

0.00

5.27

0.00

0

0

207

809

2,677

3.80

67

0.045

4/2/2016

2,937

1.92

0.000000

0.00

0.00

1.91

0.00

0

0

181

356

1,615

11.58

129

0.065

4/2/2016

2,937

1.92

0.000000

0.00

0.00

1.91

0.00

0

0

181

356

1,615

3.43

129

0.065

4/3/2016

1,515

0.99

0.000000

0.00

0.00

0.99

0.00

0

0

93

507

1,481

12.50

96

0.050

4/3/2016

1,515

0.99

0.000000

0.00

0.00

0.99

0.00

0

0

93

507

1,481

1.47

96

0.050

4/4/2016

8,921

5.88

0.000000

2.07

0.44

3.36

0.00

27

8

198

754

1,892

1.53

69

0.036

4/5/2016

11,306

7.38

0.000000

2.31

0.98

4.09

0.00

40

26

218

772

2,086

0.38

46

0.023

4/6/2016

12,252

8.00

0.000000

2.41

1.70

3.89

0.00

33

28

229

745

2,044

0.58

122

0.064

4/7/2016

15,444

10.08

0.000000

3.27

1.73

5.08

0.00

51

39

243

731

2,249

0.08

80

0.042

4/9/2016

4,599

3.00

0.000000

0.00

0.00

3.00

0.00

0

0

176

578

1,692

9.12

45

0.025

4/9/2016

4,599

3.00

0.000000

0.00

0.00

3.00

0.00

0

0

176

578

1,692

2.28

45

0.025

4/10/2016

5,594

3.65

0.000000

0.28

1.48

1.89

0.00

4

25

150

552

1,712

10.35

120

0.074

4/10/2016

5,594

3.65

0.000000

0.28

1.48

1.89

0.00

4

25

150

552

1,712

1.43

120

0.074

4/11/2016

11,906

7.77

0.000000

2.26

1.52

3.99

0.00

32

27

234

688

2,065

1.65

67

0.045

4/1/2016

10,461

7.87

0.000000

4.76

0.85

2.25

0.00

99

31

142

721

3,625

2.20

52

0.026

4/2/2016

14,873

11.11

0.000000

8.19

0.60

2.31

0.00

202

36

153

663

4,430

5.17

129

0.065

4/2/2016

14,873

11.11

0.000000

8.19

0.60

2.31

0.00

202

36

153

663

4,430

1.17

129

0.065

4/3/2016

9,917

7.41

0.000000

1.99

0.97

4.44

0.00

58

27

240

700

3,427

2.20

96

0.050

4/4/2016

7,401

5.56

0.000000

2.32

0.36

2.88

0.00

70

19

204

689

3,492

2.27

69

0.036

4/5/2016

8,964

6.70

0.000000

3.97

0.19

2.53

0.00

107

18

145

756

3,597

2.17

46

0.023

4/5/2016

8,964

6.70

0.000000

3.97

0.19

2.53

0.00

107

18

145

756

3,597

0.95

46

0.023

4/6/2016

11,080

8.30

0.000000

3.13

1.37

3.81

0.00

83

52

190

695

3,765

1.60

122

0.064

4/7/2016

4,499

3.36

0.000000

0.89

0.26

2.21

0.00

31

11

146

756

2,775

2.35

80

0.042

4/8/2016

4,363

3.26

0.000000

0.13

0.06

3.08

0.00

8

2

156

813

2,486

2.22

45

0.023

4/9/2016

10,494

7.84

0.000000

4.91

0.78

2.15

0.00

123

62

138

601

3,817

4.43

45

0.025

4/9/2016

10,494

7.84

0.000000

4.91

0.78

2.15

0.00

123

62

138

601

3,817

1.88

45

0.025

4/9/2016

10,494

7.84

0.000000

4.91

0.78

2.15

0.00

123

62

138

601

3,817

0.28

45

0.025

4/10/2016

9,776

7.38

0.000000

2.93

1.43

3.02

0.00

80

35

162

721

3,378

1.53

120

0.074

4/1/2016

7,444

5.62

0.000000

0.00

0.00

5.62

0.00

0

0

286

568

2,210

4.28

52

0.026

4/1/2016

7,444

5.62

0.000000

0.00

0.00

5.62

0.00

0

0

286

568

2,210

3.93

52

0.026

4/1/2016

7,444

5.62

0.000000

0.00

0.00

5.62

0.00

0

0

286

568

2,210

1.50

52

0.026

4/5/2016

9,910

7.48

0.000000

0.00

0.00

7.48

0.00

0

0

384

990

2,445

1.05

46

0.023

4/5/2016

9,910

7.48

0.000000

0.00

0.00

7.48

0.00

0

0

384

990

2,445

0.02

46

0.023

4/6/2016

12,409

9.37

0.000000

0.00

0.00

9.37

0.00

0

0

491

388

2,694

1.72

122

0.064

4/9/2016

10,789

8.15

0.000000

0.00

0.00

8.15

0.00

0

0

506

413

2,617

3.73

45

0.025

4/9/2016

10,789

8.15

0.000000

0.00

0.00

8.15

0.00

0

0

506

413

2,617

2.95

45

0.025

4/9/2016

10,789

8.15

0.000000

0.00

0.00

8.15

0.00

0

0

506

413

2,617

1.05

45

0.025

4/1/2016

7,225

5.18

0.000000

1.73

1.27

2.18

0.00

25

50

163

1,189

3,065

0.20

52

0.026

4/6/2016

11,761

8.43

0.000000

1.31

2.44

4.68

0.00

24

99

300

550

3,920

7.58

122

0.064

4/6/2016

11,761

8.43

0.000000

1.31

2.44

4.68

0.00

24

99

300

550

3,920

0.17

122

0.064

4/7/2016

13,987

10.03

0.000000

2.87

3.34

3.82

0.00

46

114

250

739

3,856

1.82

80

0.042

4/7/2016

13,987

10.03

0.000000

2.87

3.34

3.82

0.00

46

114

250

739

3,856

2.13

80

0.042

3/30/2016

15,491

10.24

0.000000

1.29

4.49

4.46

0.00

18

77

272

641

2,244

6.75

99

0.050

3/30/2016

15,491

10.24

0.000000

1.29

4.49

4.46

0.00

18

77

272

641

2,244

0.42

99

0.050

3/31/2016

14,097

9.32

0.000000

3.50

1.92

3.90

0.00

50

32

234

595

2,188

1.90

61

0.031

3/31/2016

14,097

9.32

0.000000

3.50

1.92

3.90

0.00

50

32

234

595

2,188

0.07

61

0.031

4/2/2016

12,437

8.30

0.000000

3.76

1.29

3.24

0.00

50

24

219

732

2,115

6.90

129

0.065

4/3/2016

12,307

8.14

0.000000

1.01

3.26

3.87

0.00

14

51

218

589

2,055

8.25

96

0.050

4/3/2016

12,307

8.14

0.000000

1.01

3.26

3.87

0.00

14

51

218

589

2,055

1.18

96

0.050

4/5/2016

12,010

7.94

0.000000

2.38

0.95

4.61

0.00

38

17

287

692

2,158

5.90

46

0.023

4/5/2016

12,010

7.94

0.000000

2.38

0.95

4.61

0.00

38

17

287

692

2,158

0.83

46

0.023

4/6/2016

12,480

8.86

3.972795

4.18

0.69

4.00

0.00

55

14

276

593

2,170

0.22

122

0.064

4/7/2016

13,417

9.65

1.926302

3.39

1.25

4.91

0.10

45

16

284

665

2,244

0.33

80

0.042

4/9/2016

12,495

8.26

0.000000

2.21

1.09

4.96

0.00

33

19

324

609

2,231

6.18

45

0.025

4/9/2016

12,495

8.26

0.000000

2.21

1.09

4.96

0.00

33

19

324

609

2,231

1.37

45

0.025

4/10/2016

10,148

6.71

0.000000

1.36

0.22

5.13

0.00

19

7

306

679

2,100

7.02

120

0.074

4/10/2016

10,148

6.71

0.000000

1.36

0.22

5.13

0.00

19

7

306

679

2,100

0.10

120

0.074

4/12/2016

5,893

3.90

0.000000

2.88

0.56

0.46

0.00

43

9

48

146

917

6.43

25

0.021

4/2/2016

10,976

7.34

0.000000

0.55

0.67

6.13

0.00

9

16

271

1,075

2,575

1.13

129

0.065

4/3/2016

16,806

11.24

0.000000

6.41

1.37

3.47

0.00

104

31

230

1,002

3,086

1.20

96

0.050

4/5/2016

12,084

9.14

4.836380

4.39

0.46

4.29

0.00

50

6

284

1,030

2,676

1.15

46

0.023

4/7/2016

14,100

10.58

4.875990

5.10

1.19

4.28

0.00

61

33

265

1,015

2,840

1.08

80

0.042

4/1/2016

11,463

7.67

0.000000

2.83

1.89

2.93

0.00

38

30

154

777

2,496

0.60

52

0.026

4/5/2016

7,478

4.99

0.000000

1.80

0.53

2.61

0.03

25

33

132

1,159

2,439

1.50

46

0.023

4/6/2016

7,352

4.73

0.000000

1.41

0.78

2.53

0.00

19

14

124

814

2,233

1.08

122

0.064

4/7/2016

14,604

10.82

0.000000

7.23

1.12

2.43

0.03

76

30

127

695

2,862

1.75

80

0.042

4/9/2016

7,338

4.67

0.000000

1.08

0.13

3.46

0.00

27

5

204

717

2,519

8.10

45

0.025

4/10/2016

569

0.35

0.000000

0.00

0.00

0.34

0.00

0

0

27

1,368

1,799

0.73

120

0.074

4/11/2016

6,242

3.92

0.000000

0.74

0.23

2.93

0.00

46

22

126

741

2,543

1.80

67

0.045

4/1/2016

14,179

11.24

0.000000

6.23

1.06

3.96

0.00

73

21

195

719

3,669

2.52

52

0.026

4/1/2016

14,179

11.24

0.000000

6.23

1.06

3.96

0.00

73

21

195

719

3,669

0.30

52

0.026

4/2/2016

3,358

2.66

0.000000

0.00

0.00

2.66

0.00

0

0

144

657

2,702

3.23

129

0.065

4/2/2016

3,358

2.66

0.000000

0.00

0.00

2.66

0.00

0

0

144

657

2,702

1.30

129

0.065

4/2/2016

3,358

2.66

0.000000

0.00

0.00

2.66

0.00

0

0

144

657

2,702

2.03

129

0.065

4/3/2016

9,152

7.26

0.000000

1.99

0.82

4.45

0.00

25

13

243

600

3,304

2.05

96

0.050

4/4/2016

13,935

11.05

2.092147

4.09

0.79

6.17

0.00

105

16

220

658

4,234

3.00

69

0.036

4/5/2016

12,846

10.19

2.253081

5.00

0.75

4.44

0.00

113

15

189

686

4,128

2.10

46

0.023

4/5/2016

12,846

10.19

2.253081

5.00

0.75

4.44

0.00

113

15

189

686

4,128

0.90

46

0.023

4/6/2016

9,124

7.24

2.092147

0.00

0.25

6.99

0.00

55

6

230

728

3,798

1.25

122

0.064

4/6/2016

9,124

7.24

2.092147

0.00

0.25

6.99

0.00

55

6

230

728

3,798

1.40

122

0.064

4/7/2016

9,725

7.71

2.253081

2.41

0.53

4.77

0.00

83

11

181

670

3,839

1.52

80

0.042

4/7/2016

9,725

7.71

2.253081

2.41

0.53

4.77

0.00

83

11

181

670

3,839

1.45

80

0.042

4/7/2016

9,725

7.71

2.253081

2.41

0.53

4.77

0.00

83

11

181

670

3,839

0.87

80

0.042

4/8/2016

8,350

6.62

2.092147

1.49

0.53

4.61

0.00

72

10

167

868

3,713

1.10

45

0.023

4/9/2016

2,240

1.78

0.000000

0.00

0.00

1.78

0.00

0

0

110

843

2,606

2.95

45

0.025

4/9/2016

2,240

1.78

0.000000

0.00

0.00

1.78

0.00

0

0

110

843

2,606

1.55

45

0.025

4/10/2016

2,631

2.09

0.000000

0.00

0.00

2.09

0.00

0

0

117

727

2,624

8.17

120

0.074

4/10/2016

2,631

2.09

0.000000

0.00

0.00

2.09

0.00

0

0

117

727

2,624

1.65

120

0.074

4/11/2016

8,837

7.01

2.092147

1.54

0.88

4.59

0.00

74

17

182

811

3,775

1.67

67

0.045

4/1/2016

4,592

2.94

0.000000

0.10

0.14

2.70

0.00

4

8

176

748

2,260

5.63

52

0.026

4/1/2016

4,592

2.94

0.000000

0.10

0.14

2.70

0.00

4

8

176

748

2,260

2.73

52

0.026

4/3/2016

7,238

4.63

0.000000

0.11

0.23

4.29

0.00

2

6

279

636

2,667

8.60

96

0.050

4/4/2016

3,821

2.45

0.000000

0.00

0.22

2.23

0.00

0

9

161

675

2,229

5.58

69

0.036

4/4/2016

3,821

2.45

0.000000

0.00

0.22

2.23

0.00

0

9

161

675

2,229

4.30

69

0.036

4/5/2016

2,332

1.49

0.000000

0.01

0.18

1.30

0.00

1

10

111

904

2,100

6.88

46

0.023

4/6/2016

2,121

1.36

0.000000

0.00

0.00

1.36

0.00

0

0

122

855

2,114

7.70

122

0.064

4/7/2016

1,291

0.83

0.000000

0.00

0.00

0.83

0.00

0

0

77

888

1,961

7.90

80

0.042

4/8/2016

1,467

0.94

0.000000

0.10

0.21

0.63

0.00

2

8

71

912

1,953

2.95

45

0.023

4/8/2016

1,467

0.94

0.000000

0.10

0.21

0.63

0.00

2

8

71

912

1,953

4.47

45

0.023

4/9/2016

1,022

0.65

0.000000

0.00

0.00

0.65

0.00

0

0

63

739

1,890

4.67

45

0.025

4/9/2016

1,022

0.65

0.000000

0.00

0.00

0.65

0.00

0

0

63

739

1,890

4.60

45

0.025

4/9/2016

1,022

0.65

0.000000

0.00

0.00

0.65

0.00

0

0

63

739

1,890

1.32

45

0.025

##      ID-AD           ActivityDate         TotalSteps    TotalDistance   
##  Min.   :1.504e+14   Length:228         Min.   :   24   Min.   : 0.020  
##  1st Qu.:3.977e+14   Class :character   1st Qu.: 4499   1st Qu.: 3.000  
##  Median :4.703e+14   Mode  :character   Median : 7482   Median : 5.620  
##  Mean   :4.985e+14                      Mean   : 7977   Mean   : 5.613  
##  3rd Qu.:6.962e+14                      3rd Qu.:11262   3rd Qu.: 7.888  
##  Max.   :8.792e+14                      Max.   :17609   Max.   :11.550  
##  LoggedActivitiesDistance VeryActiveDistance ModeratelyActiveDistance
##  Min.   :0.0000           Min.   :0.000      Min.   :0.0000          
##  1st Qu.:0.0000           1st Qu.:0.000      1st Qu.:0.0000          
##  Median :0.0000           Median :0.265      Median :0.2400          
##  Mean   :0.1638           Mean   :1.275      Mean   :0.6450          
##  3rd Qu.:0.0000           3rd Qu.:2.180      3rd Qu.:0.9275          
##  Max.   :4.8760           Max.   :8.190      Max.   :4.4900          
##  LightlyActiveDistance SedentaryActiveDistance VeryActiveMinutes
##  Min.   :0.020         Min.   :0.000000        Min.   :  0.00   
##  1st Qu.:2.245         1st Qu.:0.000000        1st Qu.:  0.00   
##  Median :3.715         Median :0.000000        Median :  4.50   
##  Mean   :3.662         Mean   :0.001184        Mean   : 22.73   
##  3rd Qu.:4.683         3rd Qu.:0.000000        3rd Qu.: 33.00   
##  Max.   :9.370         Max.   :0.100000        Max.   :202.00   
##  ModeratelyActiveMinutes LightlyActiveMinutes SedentaryMinutes    Calories   
##  Min.   :  0.00          Min.   :  3.0        Min.   : 146.0   Min.   : 745  
##  1st Qu.:  0.00          1st Qu.:155.8        1st Qu.: 632.2   1st Qu.:1958  
##  Median :  9.00          Median :217.0        Median : 727.5   Median :2260  
##  Mean   : 16.18          Mean   :216.9        Mean   : 727.4   Mean   :2462  
##  3rd Qu.: 24.25          3rd Qu.:268.0        3rd Qu.: 804.2   3rd Qu.:2893  
##  Max.   :114.00          Max.   :506.0        Max.   :1368.0   Max.   :4430  
##      logId             HoursSlept           Id            TotalIntensity  
##  Min.   :1.111e+10   Min.   : 0.020   Min.   :1.504e+09   Min.   : 25.00  
##  1st Qu.:1.129e+10   1st Qu.: 1.393   1st Qu.:3.977e+09   1st Qu.: 46.00  
##  Median :1.132e+10   Median : 2.315   Median :4.703e+09   Median : 69.00  
##  Mean   :1.132e+10   Mean   : 3.401   Mean   :4.985e+09   Mean   : 78.88  
##  3rd Qu.:1.135e+10   3rd Qu.: 5.265   3rd Qu.:6.962e+09   3rd Qu.:120.00  
##  Max.   :1.137e+10   Max.   :12.500   Max.   :8.792e+09   Max.   :139.00  
##  AverageIntensity 
##  Min.   :0.01400  
##  1st Qu.:0.02500  
##  Median :0.04200  
##  Mean   :0.04231  
##  3rd Qu.:0.06400  
##  Max.   :0.07400

described_variables

n

na

mean

sd

se_mean

IQR

skewness

kurtosis

p00

p01

p05

p10

p20

p25

p30

p40

p50

p60

p70

p75

p80

p90

p95

p99

p100

TotalIntensity

228

0

78.88

31.99

2.12

74.00

0.36

-1.36

25.00

25.54

45.00

45.00

46.00

46.00

52.00

61.00

69.00

80.00

97.00

120.00

120.00

124.10

129.00

129.73

139.00

AverageIntensity

228

0

0.04

0.02

0.00

0.04

0.45

-1.23

0.01

0.02

0.02

0.02

0.03

0.03

0.03

0.03

0.04

0.04

0.05

0.06

0.06

0.06

0.07

0.07

0.07

TotalDistance

228

0

5.61

3.03

0.20

4.89

0.10

-1.05

0.02

0.43

0.96

1.63

2.45

3.00

3.65

4.44

5.62

6.80

7.70

7.89

8.25

9.89

10.61

11.36

11.55

TotalSteps

228

0

7,976.65

4,247.19

281.28

6,763.50

0.12

-0.99

24.00

691.31

1,483.80

2,324.80

3,444.00

4,499.00

5,351.00

6,485.00

7,481.50

9,725.00

10,763.40

11,262.50

12,103.20

13,991.50

14,862.85

16,610.25

17,609.00

Calories

228

0

2,461.91

745.15

49.35

934.75

0.54

-0.24

745.00

1,017.87

1,415.80

1,615.00

1,889.40

1,958.00

2,083.30

2,201.80

2,260.00

2,445.00

2,701.20

2,892.75

3,181.60

3,669.00

3,835.85

4,205.38

4,430.00

HoursSlept

228

0

3.40

2.63

0.17

3.87

0.91

0.05

0.02

0.07

0.33

0.79

1.19

1.39

1.53

1.95

2.32

3.13

4.59

5.27

5.94

7.49

8.15

10.52

12.50

LightlyActiveDistance

228

0

3.66

1.79

0.12

2.44

0.46

0.17

0.02

0.37

0.83

1.52

2.09

2.25

2.53

3.07

3.72

4.05

4.58

4.68

4.95

5.69

6.99

8.15

9.37

LightlyActiveMinutes

228

0

216.94

85.22

5.64

112.25

0.42

1.09

3.00

32.40

73.00

116.40

145.00

155.75

170.30

195.00

217.00

239.20

258.00

268.00

280.80

314.30

331.00

501.95

506.00

SedentaryMinutes

228

0

727.36

173.73

11.51

172.00

0.31

1.97

146.00

280.81

465.80

552.00

602.60

632.25

657.00

691.40

727.50

748.00

771.90

804.25

836.80

938.30

1,024.75

1,187.38

1,368.00

VeryActiveDistance

228

0

1.27

1.78

0.12

2.18

1.65

2.48

0.00

0.00

0.00

0.00

0.00

0.00

0.00

0.00

0.27

0.97

1.73

2.18

2.41

3.82

4.91

7.15

8.19

ModeratelyActiveDistance

228

0

0.65

0.89

0.06

0.93

2.07

4.75

0.00

0.00

0.00

0.00

0.00

0.00

0.00

0.16

0.24

0.53

0.79

0.93

1.14

1.76

2.60

4.21

4.49

ModeratelyActiveMinutes

228

0

16.18

21.74

1.44

24.25

2.14

5.40

0.00

0.00

0.00

0.00

0.00

0.00

0.00

5.00

9.00

15.00

18.90

24.25

28.00

40.20

57.65

100.46

114.00

VeryActiveMinutes

228

0

22.73

34.23

2.27

33.00

2.26

6.55

0.00

0.00

0.00

0.00

0.00

0.00

0.00

0.00

4.50

15.20

26.00

33.00

43.00

72.30

95.50

123.00

202.00

LoggedActivitiesDistance

228

0

0.16

0.69

0.05

0.00

4.72

24.12

0.00

0.00

0.00

0.00

0.00

0.00

0.00

0.00

0.00

0.00

0.00

0.00

0.00

0.00

2.09

3.51

4.88

SedentaryActiveDistance

228

0

0.00

0.01

0.00

0.00

9.38

102.16

0.00

0.00

0.00

0.00

0.00

0.00

0.00

0.00

0.00

0.00

0.00

0.00

0.00

0.00

0.00

0.04

0.10

variables

min

Q1

mean

median

Q3

max

zero

minus

outlier

TotalSteps

24.00

4,499.00

7,976.65

7,481.50

11,262.50

17,609.00

0

0

0

TotalDistance

0.02

3.00

5.61

5.62

7.89

11.55

0

0

0

LoggedActivitiesDistance

0.00

0.00

0.16

0.00

0.00

4.88

214

0

14

VeryActiveDistance

0.00

0.00

1.27

0.27

2.18

8.19

94

0

8

ModeratelyActiveDistance

0.00

0.00

0.65

0.24

0.93

4.49

82

0

14

LightlyActiveDistance

0.02

2.25

3.66

3.72

4.68

9.37

0

0

1

SedentaryActiveDistance

0.00

0.00

0.00

0.00

0.00

0.10

220

0

8

VeryActiveMinutes

0.00

0.00

22.73

4.50

33.00

202.00

92

0

18

ModeratelyActiveMinutes

0.00

0.00

16.18

9.00

24.25

114.00

82

0

11

LightlyActiveMinutes

3.00

155.75

216.94

217.00

268.00

506.00

0

0

4

SedentaryMinutes

146.00

632.25

727.36

727.50

804.25

1,368.00

0

0

16

Calories

745.00

1,958.00

2,461.91

2,260.00

2,892.75

4,430.00

0

0

2

HoursSlept

0.02

1.39

3.40

2.32

5.27

12.50

0

0

2

TotalIntensity

25.00

46.00

78.88

69.00

120.00

139.00

0

0

0

AverageIntensity

0.01

0.03

0.04

0.04

0.06

0.07

0

0

0

## 
##  Anscombe-Glynn kurtosis test
## 
## data:  df_tot$TotalSteps
## kurt = 2.0065, z = -6.2249, p-value = 4.82e-10
## alternative hypothesis: kurtosis is not equal to 3
## 
##  Anscombe-Glynn kurtosis test
## 
## data:  df_tot$Calories
## kurt = 2.73955, z = -0.72288, p-value = 0.4698
## alternative hypothesis: kurtosis is not equal to 3
## 
##  Anscombe-Glynn kurtosis test
## 
## data:  df_tot$VeryActiveDistance
## kurt = 5.3966, z = 4.1207, p-value = 3.777e-05
## alternative hypothesis: kurtosis is not equal to 3
## 
##  Anscombe-Glynn kurtosis test
## 
## data:  df_tot$LightlyActiveMinutes
## kurt = 4.0398, z = 2.5349, p-value = 0.01125
## alternative hypothesis: kurtosis is not equal to 3

Our reports suggest we do not have any negative data. The variables that change the most after removing outliers are VeryActiveMinutes, SedentaryMinutes, LoggedActivitiesDistance, ModeratelyActiveDistance, and ModeratelyActiveMinutes (see FitnessSmartEDA from the SmartEDA library), with VeryActiveMinutes changing the most; its kurtosis is around 9.38 (using kurtosis()). We also tested whether the kurtosis is close to 3, as it would be for normally distributed data, using anscombe.test() on Calories, TotalSteps, VeryActiveDistance, and LightlyActiveMinutes. In the output above only Calories has a kurtosis that is not significantly different from 3 (p ≈ 0.47), so it is the variable closest to a normal shape; the other three reject that hypothesis. Most of the data appears to be skewed right, with VeryActiveMinutes having a skewness of around 2.25 (using skewness()).

We can now explore patterns within the dataset, such as how much distance a person should travel in order to burn a certain amount of calories and how sleep is related to a person's activity levels. First, however, we need to explore the heterogeneity of the distributions of the data columns.

# -- Generate density functions
# Calories burned is the variable of interest here.
# Its estimated density appears to be bimodal:

df_tot %>%
  ggplot(mapping = aes(x = Calories)) +
  # The layer inherits both the data and the x mapping from ggplot().
  geom_density() +
  labs(title = "PDF: Calories", x = "Calories", y = "Density") +
  theme_linedraw(base_size = 18)

# Calories by quartile group of SedentaryMinutes.
# cut() bins every row on the three sample quartiles: Q1 holds values up to
# the 25th percentile, Q4 values above the 75th (intervals are right-closed).
cvar <- df_tot$SedentaryMinutes
quartiles <- quantile(cvar, probs = c(0.25, 0.5, 0.75))
df_tot$qrts <- cut(
  cvar,
  breaks = c(-Inf, quartiles, Inf),
  labels = c("Q1", "Q2", "Q3", "Q4")
)
# Refer to `qrts` by column name: `df_tot$qrts` inside aes() sidesteps
# ggplot2's data masking and raises its "Use of `df$x` is discouraged" warning.
# NOTE(review): the title says "Box Plot" although a violin is drawn.
ggplot(data = df_tot, aes(x = qrts, y = Calories, fill = qrts)) +
  geom_violin(trim = FALSE, draw_quantiles = c(0.25, 0.5, 0.75)) +
  # geom_boxplot(width = 0.10) +
  labs(
    x = "Sedentary Minutes",
    y = "Calories",
    title = "Box Plot: Sedentary Minutes vs Calories",
    fill = "Quantiles"
  ) +
  # One viridis colour per quartile group present in the data.
  scale_fill_manual(values = viridis(n = length(unique(df_tot$qrts)))) +
  theme_linedraw(base_size = 18)

# Calories by quartile group of LightlyActiveMinutes.
# cut() bins every row on the three sample quartiles: Q1 holds values up to
# the 25th percentile, Q4 values above the 75th (intervals are right-closed).
cvar <- df_tot$LightlyActiveMinutes
quartiles <- quantile(cvar, probs = c(0.25, 0.5, 0.75))
df_tot$qrts <- cut(
  cvar,
  breaks = c(-Inf, quartiles, Inf),
  labels = c("Q1", "Q2", "Q3", "Q4")
)
# Refer to `qrts` by column name: `df_tot$qrts` inside aes() sidesteps
# ggplot2's data masking and raises its "Use of `df$x` is discouraged" warning.
# NOTE(review): the title says "Box Plot" although a violin is drawn.
ggplot(data = df_tot, aes(x = qrts, y = Calories, fill = qrts)) +
  geom_violin(trim = FALSE, draw_quantiles = c(0.25, 0.5, 0.75)) +
  # geom_boxplot(width = 0.10) +
  labs(
    x = "Light Activity Minutes",
    y = "Calories",
    title = "Box Plot: Light Activity Minutes vs Calories",
    fill = "Quantiles"
  ) +
  # One viridis colour per quartile group present in the data.
  scale_fill_manual(values = viridis(n = length(unique(df_tot$qrts)))) +
  theme_linedraw(base_size = 18)

# Calories by quartile group of ModeratelyActiveMinutes.
# cut() bins every row on the three sample quartiles: Q1 holds values up to
# the 25th percentile, Q4 values above the 75th (intervals are right-closed).
cvar <- df_tot$ModeratelyActiveMinutes
quartiles <- quantile(cvar, probs = c(0.25, 0.5, 0.75))
df_tot$qrts <- cut(
  cvar,
  breaks = c(-Inf, quartiles, Inf),
  labels = c("Q1", "Q2", "Q3", "Q4")
)
# Refer to `qrts` by column name: `df_tot$qrts` inside aes() sidesteps
# ggplot2's data masking and raises its "Use of `df$x` is discouraged" warning.
# NOTE(review): the title says "Box Plot" although a violin is drawn.
ggplot(data = df_tot, aes(x = qrts, y = Calories, fill = qrts)) +
  geom_violin(trim = FALSE, draw_quantiles = c(0.25, 0.5, 0.75)) +
  # geom_boxplot(width = 0.10) +
  labs(
    x = "Moderate Activity Minutes",
    y = "Calories",
    title = "Box Plot: Moderate Activity Minutes vs Calories",
    fill = "Quantiles"
  ) +
  # One viridis colour per quartile group present in the data.
  scale_fill_manual(values = viridis(n = length(unique(df_tot$qrts)))) +
  theme_linedraw(base_size = 18)

# Calories by quartile group of VeryActiveMinutes.
# cut() bins every row on the three sample quartiles: Q1 holds values up to
# the 25th percentile, Q4 values above the 75th (intervals are right-closed).
cvar <- df_tot$VeryActiveMinutes
quartiles <- quantile(cvar, probs = c(0.25, 0.5, 0.75))
df_tot$qrts <- cut(
  cvar,
  breaks = c(-Inf, quartiles, Inf),
  labels = c("Q1", "Q2", "Q3", "Q4")
)
# Refer to `qrts` by column name: `df_tot$qrts` inside aes() sidesteps
# ggplot2's data masking and raises its "Use of `df$x` is discouraged" warning.
# NOTE(review): the title says "Box Plot" although a violin is drawn.
ggplot(data = df_tot, aes(x = qrts, y = Calories, fill = qrts)) +
  geom_violin(trim = FALSE, draw_quantiles = c(0.25, 0.5, 0.75)) +
  # geom_boxplot(width = 0.10) +
  labs(
    x = "Very Activity Minutes",
    y = "Calories",
    title = "Box Plot: Very Activity Minutes vs Calories",
    fill = "Quantiles"
  ) +
  # One viridis colour per quartile group present in the data.
  scale_fill_manual(values = viridis(n = length(unique(df_tot$qrts)))) +
  theme_linedraw(base_size = 18)

# Remark: SedentaryActiveDistance does not work due to the distribution
# properties: its 25th, 50th and 75th percentiles are all 0, so the break
# points passed to cut() would not be unique and cut() would raise an error.

# Calories by quartile group of LightlyActiveDistance.
# cut() bins every row on the three sample quartiles: Q1 holds values up to
# the 25th percentile, Q4 values above the 75th (intervals are right-closed).
cvar <- df_tot$LightlyActiveDistance
quartiles <- quantile(cvar, probs = c(0.25, 0.5, 0.75))
df_tot$qrts <- cut(
  cvar,
  breaks = c(-Inf, quartiles, Inf),
  labels = c("Q1", "Q2", "Q3", "Q4")
)
# Refer to `qrts` by column name: `df_tot$qrts` inside aes() sidesteps
# ggplot2's data masking and raises its "Use of `df$x` is discouraged" warning.
# NOTE(review): the title says "Box Plot" although a violin is drawn.
ggplot(data = df_tot, aes(x = qrts, y = Calories, fill = qrts)) +
  geom_violin(trim = FALSE, draw_quantiles = c(0.25, 0.5, 0.75)) +
  # geom_boxplot(width = 0.10) +
  labs(
    x = "Light Activity Distance",
    y = "Calories",
    title = "Box Plot: Light Activity Distance vs Calories",
    fill = "Quantiles"
  ) +
  # One viridis colour per quartile group present in the data.
  scale_fill_manual(values = viridis(n = length(unique(df_tot$qrts)))) +
  theme_linedraw(base_size = 18)

# Calories by quartile group of ModeratelyActiveDistance.
# cut() bins every row on the three sample quartiles: Q1 holds values up to
# the 25th percentile, Q4 values above the 75th (intervals are right-closed).
cvar <- df_tot$ModeratelyActiveDistance
quartiles <- quantile(cvar, probs = c(0.25, 0.5, 0.75))
df_tot$qrts <- cut(
  cvar,
  breaks = c(-Inf, quartiles, Inf),
  labels = c("Q1", "Q2", "Q3", "Q4")
)
# Refer to `qrts` by column name: `df_tot$qrts` inside aes() sidesteps
# ggplot2's data masking and raises its "Use of `df$x` is discouraged" warning.
# NOTE(review): the title says "Box Plot" although a violin is drawn.
ggplot(data = df_tot, aes(x = qrts, y = Calories, fill = qrts)) +
  geom_violin(trim = FALSE, draw_quantiles = c(0.25, 0.5, 0.75)) +
  # geom_boxplot(width = 0.10) +
  labs(
    x = "Moderate Activity Distance",
    y = "Calories",
    title = "Box Plot: Moderate Activity Distance vs Calories",
    fill = "Quantiles"
  ) +
  # One viridis colour per quartile group present in the data.
  scale_fill_manual(values = viridis(n = length(unique(df_tot$qrts)))) +
  theme_linedraw(base_size = 18)

# Calories by quartile group of VeryActiveDistance.
# cut() bins every row on the three sample quartiles: Q1 holds values up to
# the 25th percentile, Q4 values above the 75th (intervals are right-closed).
cvar <- df_tot$VeryActiveDistance
quartiles <- quantile(cvar, probs = c(0.25, 0.5, 0.75))
df_tot$qrts <- cut(
  cvar,
  breaks = c(-Inf, quartiles, Inf),
  labels = c("Q1", "Q2", "Q3", "Q4")
)
# Refer to `qrts` by column name: `df_tot$qrts` inside aes() sidesteps
# ggplot2's data masking and raises its "Use of `df$x` is discouraged" warning.
# NOTE(review): the title says "Box Plot" although a violin is drawn.
ggplot(data = df_tot, aes(x = qrts, y = Calories, fill = qrts)) +
  geom_violin(trim = FALSE, draw_quantiles = c(0.25, 0.5, 0.75)) +
  # geom_boxplot(width = 0.10) +
  labs(
    x = "Very Activity Distance",
    y = "Calories",
    title = "Box Plot: Very Activity Distance vs Calories",
    fill = "Quantiles"
  ) +
  # One viridis colour per quartile group present in the data.
  scale_fill_manual(values = viridis(n = length(unique(df_tot$qrts)))) +
  theme_linedraw(base_size = 18)

We observe that the LightlyActiveMinutes plot is bimodally distributed in the first and third quartiles, and that a significant proportion of the highest (fourth) quartile forms a skinny tail (the violin plot in yellow), suggesting a uniform spread. In the ModeratelyActiveMinutes plot, we observe similar distributions for the third and fourth quartiles, and similar results are obtained in the VeryActiveMinutes plot for the first and second quartiles. This suggests that the upper and lower tails of the variables are not similarly distributed and that we can categorize the activity levels based on multiple columns. Similar results are obtained when substituting distance for minutes, but the qualitative shape of the distribution is less heterogeneous.

After exploring various methods, I started by taking the greater of the two measures of central tendency (mean and median) for each variable (VeryActiveMinutes, ModeratelyActiveMinutes, and LightlyActiveMinutes). An observation is classified as Very Active if its VeryActiveMinutes is greater than or equal to the max of the mean and median of VeryActiveMinutes; otherwise it falls into the Moderately Active, Lightly Active, or Sedentary group. To be classified as Moderately Active, its ModeratelyActiveMinutes must be greater than or equal to the max of the median and mean of that column. A similar approach is applied for the Lightly Active categorization. If an observation does not fall into any of the three classifications, it is Sedentary.
A similar method can be applied when substituting Distance for Minutes, but the results do not change much (there is just a slight difference in where each observation is categorized, which can be seen in the bar chart).

# -- Group by using the binning method -- #
# In the absence of categorical data, we will create factor variables based on the numeric variables.
#
# Every row of df_tot gets a `user_type` label derived from its activity
# minutes. The cutoff for each intensity level is the larger of that column's
# median and mean over all rows. case_when() takes the first branch that
# matches, so a row is tested against Very, then Moderately, then Lightly
# Active, and is Sedentary when it is below all three cutoffs.
#
# mutate() + select() is used instead of summarise(): returning one row per
# input row from summarise() is deprecated since dplyr 1.1.0.
# NOTE(review): assumes the activity columns contain no NA values (the
# describe() output reports na = 0); with NAs the labels could differ from the
# earlier fully spelled-out conditions.
df_tot_user_mins <- df_tot %>%
  mutate(
    # Cutoffs are computed once over the whole column and recycled per row.
    very_cut = max(median(VeryActiveMinutes), mean(VeryActiveMinutes)),
    moderate_cut = max(
      median(ModeratelyActiveMinutes),
      mean(ModeratelyActiveMinutes)
    ),
    light_cut = max(median(LightlyActiveMinutes), mean(LightlyActiveMinutes)),
    user_type = factor(
      case_when(
        VeryActiveMinutes >= very_cut ~ "Very Active",
        ModeratelyActiveMinutes >= moderate_cut ~ "Moderately Active",
        LightlyActiveMinutes >= light_cut ~ "Lightly Active",
        LightlyActiveMinutes < light_cut ~ "Sedentary"
      ),
      levels = c("Very Active", "Moderately Active", "Lightly Active", "Sedentary")
    )
  ) %>%
  # Keep only the label and the outcome columns; this drops the helper cutoffs.
  select(user_type, Calories, HoursSlept, TotalDistance, TotalIntensity)

# Inspect the minutes-based classification: glimpse() prints the column
# overview and invisibly returns its input, which flextable() then renders
# as a table.

df_tot_user_mins %>%
  glimpse() %>%
  flextable()
## Rows: 228
## Columns: 5
## $ user_type      <fct> Very Active, Very Active, Very Active, Very Active, Ver…
## $ Calories       <dbl> 1819, 2154, 2154, 1944, 1944, 1932, 1886, 1889, 1868, 1…
## $ HoursSlept     <dbl> 6.42, 7.60, 0.23, 2.13, 5.48, 1.27, 6.28, 5.58, 5.08, 8…
## $ TotalDistance  <dbl> 7.11, 11.55, 11.55, 8.53, 8.53, 8.93, 7.85, 7.86, 7.87,…
## $ TotalIntensity <dbl> 139, 130, 130, 97, 97, 76, 56, 61, 52, 129, 96, 69, 46,…

user_type

Calories

HoursSlept

TotalDistance

TotalIntensity

Very Active

1,819

6.42

7.11

139

Very Active

2,154

7.60

11.55

130

Very Active

2,154

0.23

11.55

130

Very Active

1,944

2.13

8.53

97

Very Active

1,944

5.48

8.53

97

Very Active

1,932

1.27

8.93

76

Very Active

1,886

6.28

7.85

56

Very Active

1,889

5.58

7.86

61

Very Active

1,868

5.08

7.87

52

Very Active

1,843

8.47

7.25

129

Moderately Active

1,850

7.08

6.37

96

Very Active

2,030

5.60

9.80

69

Very Active

2,083

5.13

9.73

46

Very Active

1,883

6.77

8.10

45

Very Active

1,755

7.95

6.98

120

Very Active

1,811

5.47

7.26

67

Sedentary

1,636

3.15

1.88

96

Sedentary

1,807

10.58

3.29

45

Lightly Active

2,783

5.23

3.92

129

Sedentary

2,449

1.83

2.21

96

Sedentary

2,380

3.13

1.63

69

Sedentary

2,344

6.98

1.55

46

Sedentary

2,202

7.13

0.87

122

Sedentary

2,443

0.77

1.75

45

Sedentary

2,442

5.13

1.46

120

Sedentary

2,442

1.03

1.46

120

Sedentary

2,255

2.05

0.84

67

Sedentary

942

2.13

0.02

25

Sedentary

1,407

2.45

1.62

52

Sedentary

1,237

2.55

0.76

129

Sedentary

1,330

2.97

1.43

96

Lightly Active

1,583

2.00

4.14

69

Lightly Active

1,538

2.57

3.50

46

Sedentary

1,421

2.17

1.97

122

Sedentary

1,392

2.95

1.72

80

Sedentary

1,425

2.92

2.33

45

Sedentary

1,413

2.82

1.92

45

Lightly Active

1,515

2.72

3.19

120

Sedentary

1,405

3.18

2.03

67

Moderately Active

2,041

1.82

6.79

56

Moderately Active

2,187

1.57

7.10

99

Lightly Active

1,929

2.28

4.47

61

Very Active

2,438

7.63

10.22

129

Very Active

2,438

0.05

10.22

129

Lightly Active

2,035

2.50

4.95

96

Moderately Active

2,099

2.08

6.80

69

Lightly Active

2,096

0.62

6.82

46

Moderately Active

2,338

1.25

6.85

80

Moderately Active

2,488

0.33

10.63

45

Lightly Active

2,164

1.93

6.83

120

Sedentary

1,288

7.15

3.01

52

Lightly Active

1,490

4.82

5.10

129

Very Active

1,630

5.97

7.70

96

Moderately Active

1,648

6.22

7.99

69

Very Active

1,649

8.08

8.23

46

Very Active

1,783

7.08

10.32

122

Moderately Active

1,431

6.73

5.63

80

Moderately Active

1,524

7.98

6.84

45

Very Active

1,697

6.95

9.76

45

Sedentary

1,232

7.52

2.39

120

Sedentary

1,223

6.90

2.14

67

Lightly Active

2,990

6.00

3.97

98

Lightly Active

2,990

1.12

3.97

98

Sedentary

2,480

5.37

2.31

53

Sedentary

2,570

4.82

2.17

27

Sedentary

3,016

5.67

4.23

53

Very Active

3,830

8.20

8.99

73

Sedentary

3,706

6.77

6.41

70

Lightly Active

3,418

2.43

3.87

99

Lightly Active

3,418

3.98

3.87

99

Moderately Active

3,439

4.45

5.33

61

Moderately Active

3,338

4.08

3.84

52

Moderately Active

3,338

2.92

3.84

52

Moderately Active

3,338

0.45

3.84

52

Sedentary

2,892

3.12

3.10

129

Lightly Active

3,313

5.00

4.38

96

Lightly Active

3,313

1.62

4.38

96

Moderately Active

3,118

7.95

4.60

69

Lightly Active

3,253

4.27

5.36

122

Sedentary

2,817

1.22

3.68

120

Sedentary

2,507

3.75

2.15

67

Sedentary

1,958

1.60

3.79

56

Sedentary

1,958

1.73

3.79

56

Moderately Active

2,129

0.77

5.46

61

Moderately Active

2,216

1.55

6.27

52

Moderately Active

2,216

0.72

6.27

52

Lightly Active

2,154

0.97

5.66

129

Moderately Active

2,178

1.03

5.52

96

Moderately Active

2,308

1.52

6.76

69

Lightly Active

2,201

1.15

5.63

46

Moderately Active

2,341

4.30

7.35

122

Moderately Active

2,341

0.80

7.35

122

Sedentary

2,010

3.75

1.55

56

Sedentary

2,010

4.18

1.55

56

Lightly Active

2,227

4.90

3.22

61

Lightly Active

2,227

2.00

3.22

61

Sedentary

2,133

5.60

2.19

52

Lightly Active

2,317

8.23

3.63

129

Lightly Active

2,217

8.25

2.84

46

Very Active

2,374

7.48

4.78

122

Very Active

2,335

4.45

4.61

80

Lightly Active

2,303

1.68

3.72

45

Lightly Active

2,303

2.17

3.72

45

Lightly Active

2,210

3.38

2.81

45

Lightly Active

2,210

1.15

2.81

45

Lightly Active

2,210

2.80

2.81

45

Lightly Active

2,424

4.87

4.45

120

Lightly Active

2,424

1.25

4.45

120

Very Active

2,297

4.55

4.13

67

Very Active

2,297

2.55

4.13

67

Sedentary

745

5.52

0.19

25

Sedentary

1,680

1.43

2.10

129

Lightly Active

2,886

1.15

5.28

56

Lightly Active

2,915

1.53

6.15

99

Lightly Active

2,895

1.00

5.65

61

Lightly Active

2,923

2.00

5.81

52

Moderately Active

3,323

2.00

8.38

129

Moderately Active

3,357

1.95

9.83

96

Moderately Active

2,931

1.33

6.36

69

Sedentary

2,848

1.32

5.41

46

Lightly Active

2,943

1.75

6.18

122

Sedentary

2,822

1.63

5.34

80

Moderately Active

3,597

7.20

11.36

45

Moderately Active

3,597

2.57

11.36

45

Moderately Active

3,224

2.02

9.03

120

Moderately Active

3,224

0.87

9.03

120

Sedentary

2,677

3.80

5.27

67

Sedentary

1,615

11.58

1.92

129

Sedentary

1,615

3.43

1.92

129

Sedentary

1,481

12.50

0.99

96

Sedentary

1,481

1.47

0.99

96

Very Active

1,892

1.53

5.88

69

Very Active

2,086

0.38

7.38

46

Very Active

2,044

0.58

8.00

122

Very Active

2,249

0.08

10.08

80

Sedentary

1,692

9.12

3.00

45

Sedentary

1,692

2.28

3.00

45

Moderately Active

1,712

10.35

3.65

120

Moderately Active

1,712

1.43

3.65

120

Very Active

2,065

1.65

7.77

67

Very Active

3,625

2.20

7.87

52

Very Active

4,430

5.17

11.11

129

Very Active

4,430

1.17

11.11

129

Very Active

3,427

2.20

7.41

96

Very Active

3,492

2.27

5.56

69

Very Active

3,597

2.17

6.70

46

Very Active

3,597

0.95

6.70

46

Very Active

3,765

1.60

8.30

122

Very Active

2,775

2.35

3.36

80

Sedentary

2,486

2.22

3.26

45

Very Active

3,817

4.43

7.84

45

Very Active

3,817

1.88

7.84

45

Very Active

3,817

0.28

7.84

45

Very Active

3,378

1.53

7.38

120

Lightly Active

2,210

4.28

5.62

52

Lightly Active

2,210

3.93

5.62

52

Lightly Active

2,210

1.50

5.62

52

Lightly Active

2,445

1.05

7.48

46

Lightly Active

2,445

0.02

7.48

46

Lightly Active

2,694

1.72

9.37

122

Lightly Active

2,617

3.73

8.15

45

Lightly Active

2,617

2.95

8.15

45

Lightly Active

2,617

1.05

8.15

45

Very Active

3,065

0.20

5.18

52

Very Active

3,920

7.58

8.43

122

Very Active

3,920

0.17

8.43

122

Very Active

3,856

1.82

10.03

80

Very Active

3,856

2.13

10.03

80

Moderately Active

2,244

6.75

10.24

99

Moderately Active

2,244

0.42

10.24

99

Very Active

2,188

1.90

9.32

61

Very Active

2,188

0.07

9.32

61

Very Active

2,115

6.90

8.30

129

Moderately Active

2,055

8.25

8.14

96

Moderately Active

2,055

1.18

8.14

96

Very Active

2,158

5.90

7.94

46

Very Active

2,158

0.83

7.94

46

Very Active

2,170

0.22

8.86

122

Very Active

2,244

0.33

9.65

80

Very Active

2,231

6.18

8.26

45

Very Active

2,231

1.37

8.26

45

Lightly Active

2,100

7.02

6.71

120

Lightly Active

2,100

0.10

6.71

120

Very Active

917

6.43

3.90

25

Lightly Active

2,575

1.13

7.34

129

Very Active

3,086

1.20

11.24

96

Very Active

2,676

1.15

9.14

46

Very Active

2,840

1.08

10.58

80

Very Active

2,496

0.60

7.67

52

Very Active

2,439

1.50

4.99

46

Sedentary

2,233

1.08

4.73

122

Very Active

2,862

1.75

10.82

80

Very Active

2,519

8.10

4.67

45

Sedentary

1,799

0.73

0.35

120

Very Active

2,543

1.80

3.92

67

Very Active

3,669

2.52

11.24

52

Very Active

3,669

0.30

11.24

52

Sedentary

2,702

3.23

2.66

129

Sedentary

2,702

1.30

2.66

129

Sedentary

2,702

2.03

2.66

129

Very Active

3,304

2.05

7.26

96

Very Active

4,234

3.00

11.05

69

Very Active

4,128

2.10

10.19

46

Very Active

4,128

0.90

10.19

46

Very Active

3,798

1.25

7.24

122

Very Active

3,798

1.40

7.24

122

Very Active

3,839

1.52

7.71

80

Very Active

3,839

1.45

7.71

80

Very Active

3,839

0.87

7.71

80

Very Active

3,713

1.10

6.62

45

Sedentary

2,606

2.95

1.78

45

Sedentary

2,606

1.55

1.78

45

Sedentary

2,624

8.17

2.09

120

Sedentary

2,624

1.65

2.09

120

Very Active

3,775

1.67

7.01

67

Sedentary

2,260

5.63

2.94

52

Sedentary

2,260

2.73

2.94

52

Lightly Active

2,667

8.60

4.63

96

Sedentary

2,229

5.58

2.45

69

Sedentary

2,229

4.30

2.45

69

Sedentary

2,100

6.88

1.49

46

Sedentary

2,114

7.70

1.36

122

Sedentary

1,961

7.90

0.83

80

Sedentary

1,953

2.95

0.94

45

Sedentary

1,953

4.47

0.94

45

Sedentary

1,890

4.67

0.65

45

Sedentary

1,890

4.60

0.65

45

Sedentary

1,890

1.32

0.65

45

# Summary statistics per column of the minutes-based table; for the factor
# `user_type`, summary() reports the number of rows in each level.
summary(df_tot_user_mins) 
##              user_type     Calories      HoursSlept     TotalDistance   
##  Very Active      :80   Min.   : 745   Min.   : 0.020   Min.   : 0.020  
##  Moderately Active:34   1st Qu.:1958   1st Qu.: 1.393   1st Qu.: 3.000  
##  Lightly Active   :47   Median :2260   Median : 2.315   Median : 5.620  
##  Sedentary        :67   Mean   :2462   Mean   : 3.401   Mean   : 5.613  
##                         3rd Qu.:2893   3rd Qu.: 5.265   3rd Qu.: 7.888  
##                         Max.   :4430   Max.   :12.500   Max.   :11.550  
##  TotalIntensity  
##  Min.   : 25.00  
##  1st Qu.: 46.00  
##  Median : 69.00  
##  Mean   : 78.88  
##  3rd Qu.:120.00  
##  Max.   :139.00
# Same binning as the minutes-based classification, driven by the distance
# columns instead. Every row of df_tot gets a `user_type` label; the cutoff for
# each intensity level is the larger of that column's median and mean over all
# rows. case_when() takes the first branch that matches, so a row is tested
# against Very, then Moderately, then Lightly Active, and is Sedentary when it
# is below all three cutoffs.
#
# mutate() + select() is used instead of summarise(): returning one row per
# input row from summarise() is deprecated since dplyr 1.1.0.
# NOTE(review): assumes the distance columns contain no NA values (the
# describe() output reports na = 0); with NAs the labels could differ from the
# earlier fully spelled-out conditions.
df_tot_user_dist <- df_tot %>%
  mutate(
    # Cutoffs are computed once over the whole column and recycled per row.
    very_cut = max(median(VeryActiveDistance), mean(VeryActiveDistance)),
    moderate_cut = max(
      median(ModeratelyActiveDistance),
      mean(ModeratelyActiveDistance)
    ),
    light_cut = max(median(LightlyActiveDistance), mean(LightlyActiveDistance)),
    user_type = factor(
      case_when(
        VeryActiveDistance >= very_cut ~ "Very Active",
        ModeratelyActiveDistance >= moderate_cut ~ "Moderately Active",
        LightlyActiveDistance >= light_cut ~ "Lightly Active",
        LightlyActiveDistance < light_cut ~ "Sedentary"
      ),
      levels = c("Very Active", "Moderately Active", "Lightly Active", "Sedentary")
    )
  ) %>%
  # Keep only the label and the outcome columns; this drops the helper cutoffs.
  select(user_type, Calories, HoursSlept, TotalDistance, TotalIntensity)

# Inspect the distance-based classification: glimpse() prints the column
# overview and invisibly returns its input, which flextable() then renders
# as a table.
df_tot_user_dist %>%
  glimpse() %>%
  flextable()
## Rows: 228
## Columns: 5
## $ user_type      <fct> Very Active, Very Active, Very Active, Very Active, Ver…
## $ Calories       <dbl> 1819, 2154, 2154, 1944, 1944, 1932, 1886, 1889, 1868, 1…
## $ HoursSlept     <dbl> 6.42, 7.60, 0.23, 2.13, 5.48, 1.27, 6.28, 5.58, 5.08, 8…
## $ TotalDistance  <dbl> 7.11, 11.55, 11.55, 8.53, 8.53, 8.93, 7.85, 7.86, 7.87,…
## $ TotalIntensity <dbl> 139, 130, 130, 97, 97, 76, 56, 61, 52, 129, 96, 69, 46,…

user_type

Calories

HoursSlept

TotalDistance

TotalIntensity

Very Active

1,819

6.42

7.11

139

Very Active

2,154

7.60

11.55

130

Very Active

2,154

0.23

11.55

130

Very Active

1,944

2.13

8.53

97

Very Active

1,944

5.48

8.53

97

Very Active

1,932

1.27

8.93

76

Very Active

1,886

6.28

7.85

56

Very Active

1,889

5.58

7.86

61

Very Active

1,868

5.08

7.87

52

Very Active

1,843

8.47

7.25

129

Moderately Active

1,850

7.08

6.37

96

Very Active

2,030

5.60

9.80

69

Very Active

2,083

5.13

9.73

46

Very Active

1,883

6.77

8.10

45

Very Active

1,755

7.95

6.98

120

Very Active

1,811

5.47

7.26

67

Sedentary

1,636

3.15

1.88

96

Sedentary

1,807

10.58

3.29

45

Lightly Active

2,783

5.23

3.92

129

Sedentary

2,449

1.83

2.21

96

Sedentary

2,380

3.13

1.63

69

Sedentary

2,344

6.98

1.55

46

Sedentary

2,202

7.13

0.87

122

Sedentary

2,443

0.77

1.75

45

Sedentary

2,442

5.13

1.46

120

Sedentary

2,442

1.03

1.46

120

Sedentary

2,255

2.05

0.84

67

Sedentary

942

2.13

0.02

25

Sedentary

1,407

2.45

1.62

52

Sedentary

1,237

2.55

0.76

129

Sedentary

1,330

2.97

1.43

96

Lightly Active

1,583

2.00

4.14

69

Sedentary

1,538

2.57

3.50

46

Sedentary

1,421

2.17

1.97

122

Sedentary

1,392

2.95

1.72

80

Sedentary

1,425

2.92

2.33

45

Sedentary

1,413

2.82

1.92

45

Sedentary

1,515

2.72

3.19

120

Sedentary

1,405

3.18

2.03

67

Moderately Active

2,041

1.82

6.79

56

Very Active

2,187

1.57

7.10

99

Lightly Active

1,929

2.28

4.47

61

Very Active

2,438

7.63

10.22

129

Very Active

2,438

0.05

10.22

129

Lightly Active

2,035

2.50

4.95

96

Very Active

2,099

2.08

6.80

69

Very Active

2,096

0.62

6.82

46

Moderately Active

2,338

1.25

6.85

80

Moderately Active

2,488

0.33

10.63

45

Lightly Active

2,164

1.93

6.83

120

Sedentary

1,288

7.15

3.01

52

Lightly Active

1,490

4.82

5.10

129

Very Active

1,630

5.97

7.70

96

Moderately Active

1,648

6.22

7.99

69

Very Active

1,649

8.08

8.23

46

Very Active

1,783

7.08

10.32

122

Moderately Active

1,431

6.73

5.63

80

Moderately Active

1,524

7.98

6.84

45

Very Active

1,697

6.95

9.76

45

Sedentary

1,232

7.52

2.39

120

Sedentary

1,223

6.90

2.14

67

Lightly Active

2,990

6.00

3.97

98

Lightly Active

2,990

1.12

3.97

98

Sedentary

2,480

5.37

2.31

53

Sedentary

2,570

4.82

2.17

27

Lightly Active

3,016

5.67

4.23

53

Very Active

3,830

8.20

8.99

73

Sedentary

3,706

6.77

6.41

70

Lightly Active

3,418

2.43

3.87

99

Lightly Active

3,418

3.98

3.87

99

Moderately Active

3,439

4.45

5.33

61

Sedentary

3,338

4.08

3.84

52

Sedentary

3,338

2.92

3.84

52

Sedentary

3,338

0.45

3.84

52

Sedentary

2,892

3.12

3.10

129

Lightly Active

3,313

5.00

4.38

96

Lightly Active

3,313

1.62

4.38

96

Moderately Active

3,118

7.95

4.60

69

Lightly Active

3,253

4.27

5.36

122

Sedentary

2,817

1.22

3.68

120

Sedentary

2,507

3.75

2.15

67

Sedentary

1,958

1.60

3.79

56

Sedentary

1,958

1.73

3.79

56

Lightly Active

2,129

0.77

5.46

61

Moderately Active

2,216

1.55

6.27

52

Moderately Active

2,216

0.72

6.27

52

Lightly Active

2,154

0.97

5.66

129

Moderately Active

2,178

1.03

5.52

96

Moderately Active

2,308

1.52

6.76

69

Lightly Active

2,201

1.15

5.63

46

Moderately Active

2,341

4.30

7.35

122

Moderately Active

2,341

0.80

7.35

122

Sedentary

2,010

3.75

1.55

56

Sedentary

2,010

4.18

1.55

56

Sedentary

2,227

4.90

3.22

61

Sedentary

2,227

2.00

3.22

61

Sedentary

2,133

5.60

2.19

52

Sedentary

2,317

8.23

3.63

129

Sedentary

2,217

8.25

2.84

46

Very Active

2,374

7.48

4.78

122

Very Active

2,335

4.45

4.61

80

Lightly Active

2,303

1.68

3.72

45

Lightly Active

2,303

2.17

3.72

45

Sedentary

2,210

3.38

2.81

45

Sedentary

2,210

1.15

2.81

45

Sedentary

2,210

2.80

2.81

45

Lightly Active

2,424

4.87

4.45

120

Lightly Active

2,424

1.25

4.45

120

Very Active

2,297

4.55

4.13

67

Very Active

2,297

2.55

4.13

67

Sedentary

745

5.52

0.19

25

Sedentary

1,680

1.43

2.10

129

Lightly Active

2,886

1.15

5.28

56

Lightly Active

2,915

1.53

6.15

99

Lightly Active

2,895

1.00

5.65

61

Lightly Active

2,923

2.00

5.81

52

Moderately Active

3,323

2.00

8.38

129

Moderately Active

3,357

1.95

9.83

96

Moderately Active

2,931

1.33

6.36

69

Moderately Active

2,848

1.32

5.41

46

Lightly Active

2,943

1.75

6.18

122

Lightly Active

2,822

1.63

5.34

80

Moderately Active

3,597

7.20

11.36

45

Moderately Active

3,597

2.57

11.36

45

Moderately Active

3,224

2.02

9.03

120

Moderately Active

3,224

0.87

9.03

120

Lightly Active

2,677

3.80

5.27

67

Sedentary

1,615

11.58

1.92

129

Sedentary

1,615

3.43

1.92

129

Sedentary

1,481

12.50

0.99

96

Sedentary

1,481

1.47

0.99

96

Very Active

1,892

1.53

5.88

69

Very Active

2,086

0.38

7.38

46

Very Active

2,044

0.58

8.00

122

Very Active

2,249

0.08

10.08

80

Sedentary

1,692

9.12

3.00

45

Sedentary

1,692

2.28

3.00

45

Moderately Active

1,712

10.35

3.65

120

Moderately Active

1,712

1.43

3.65

120

Very Active

2,065

1.65

7.77

67

Very Active

3,625

2.20

7.87

52

Very Active

4,430

5.17

11.11

129

Very Active

4,430

1.17

11.11

129

Very Active

3,427

2.20

7.41

96

Very Active

3,492

2.27

5.56

69

Very Active

3,597

2.17

6.70

46

Very Active

3,597

0.95

6.70

46

Very Active

3,765

1.60

8.30

122

Sedentary

2,775

2.35

3.36

80

Sedentary

2,486

2.22

3.26

45

Very Active

3,817

4.43

7.84

45

Very Active

3,817

1.88

7.84

45

Very Active

3,817

0.28

7.84

45

Very Active

3,378

1.53

7.38

120

Lightly Active

2,210

4.28

5.62

52

Lightly Active

2,210

3.93

5.62

52

Lightly Active

2,210

1.50

5.62

52

Lightly Active

2,445

1.05

7.48

46

Lightly Active

2,445

0.02

7.48

46

Lightly Active

2,694

1.72

9.37

122

Lightly Active

2,617

3.73

8.15

45

Lightly Active

2,617

2.95

8.15

45

Lightly Active

2,617

1.05

8.15

45

Very Active

3,065

0.20

5.18

52

Very Active

3,920

7.58

8.43

122

Very Active

3,920

0.17

8.43

122

Very Active

3,856

1.82

10.03

80

Very Active

3,856

2.13

10.03

80

Very Active

2,244

6.75

10.24

99

Very Active

2,244

0.42

10.24

99

Very Active

2,188

1.90

9.32

61

Very Active

2,188

0.07

9.32

61

Very Active

2,115

6.90

8.30

129

Moderately Active

2,055

8.25

8.14

96

Moderately Active

2,055

1.18

8.14

96

Very Active

2,158

5.90

7.94

46

Very Active

2,158

0.83

7.94

46

Very Active

2,170

0.22

8.86

122

Very Active

2,244

0.33

9.65

80

Very Active

2,231

6.18

8.26

45

Very Active

2,231

1.37

8.26

45

Very Active

2,100

7.02

6.71

120

Very Active

2,100

0.10

6.71

120

Very Active

917

6.43

3.90

25

Moderately Active

2,575

1.13

7.34

129

Very Active

3,086

1.20

11.24

96

Very Active

2,676

1.15

9.14

46

Very Active

2,840

1.08

10.58

80

Very Active

2,496

0.60

7.67

52

Very Active

2,439

1.50

4.99

46

Very Active

2,233

1.08

4.73

122

Very Active

2,862

1.75

10.82

80

Sedentary

2,519

8.10

4.67

45

Sedentary

1,799

0.73

0.35

120

Sedentary

2,543

1.80

3.92

67

Very Active

3,669

2.52

11.24

52

Very Active

3,669

0.30

11.24

52

Sedentary

2,702

3.23

2.66

129

Sedentary

2,702

1.30

2.66

129

Sedentary

2,702

2.03

2.66

129

Very Active

3,304

2.05

7.26

96

Very Active

4,234

3.00

11.05

69

Very Active

4,128

2.10

10.19

46

Very Active

4,128

0.90

10.19

46

Lightly Active

3,798

1.25

7.24

122

Lightly Active

3,798

1.40

7.24

122

Very Active

3,839

1.52

7.71

80

Very Active

3,839

1.45

7.71

80

Very Active

3,839

0.87

7.71

80

Very Active

3,713

1.10

6.62

45

Sedentary

2,606

2.95

1.78

45

Sedentary

2,606

1.55

1.78

45

Sedentary

2,624

8.17

2.09

120

Sedentary

2,624

1.65

2.09

120

Very Active

3,775

1.67

7.01

67

Sedentary

2,260

5.63

2.94

52

Sedentary

2,260

2.73

2.94

52

Lightly Active

2,667

8.60

4.63

96

Sedentary

2,229

5.58

2.45

69

Sedentary

2,229

4.30

2.45

69

Sedentary

2,100

6.88

1.49

46

Sedentary

2,114

7.70

1.36

122

Sedentary

1,961

7.90

0.83

80

Sedentary

1,953

2.95

0.94

45

Sedentary

1,953

4.47

0.94

45

Sedentary

1,890

4.67

0.65

45

Sedentary

1,890

4.60

0.65

45

Sedentary

1,890

1.32

0.65

45

# Print per-column summary statistics of the distance-binned data frame.
summary(df_tot_user_dist)
##              user_type     Calories      HoursSlept     TotalDistance   
##  Very Active      :83   Min.   : 745   Min.   : 0.020   Min.   : 0.020  
##  Moderately Active:28   1st Qu.:1958   1st Qu.: 1.393   1st Qu.: 3.000  
##  Lightly Active   :40   Median :2260   Median : 2.315   Median : 5.620  
##  Sedentary        :77   Mean   :2462   Mean   : 3.401   Mean   : 5.613  
##                         3rd Qu.:2893   3rd Qu.: 5.265   3rd Qu.: 7.888  
##                         Max.   :4430   Max.   :12.500   Max.   :11.550  
##  TotalIntensity  
##  Min.   : 25.00  
##  1st Qu.: 46.00  
##  Median : 69.00  
##  Mean   : 78.88  
##  3rd Qu.:120.00  
##  Max.   :139.00
# -- Normality tests before grouping -- #
# Run dlookr's normality test on every numeric column of the merged data
# (identifier columns are dropped first), round the numeric results to two
# decimals and list the variables from lowest to highest test statistic.

library(flextable)
df_tot %>%
  select(-Id, -"ID-AD", -logId) %>%
  dlookr::normality() %>%
  # `where()` is required for predicate selection; a bare `is.numeric`
  # inside across() is deprecated.
  mutate(across(where(is.numeric), ~ round(.x, 2))) %>%
  arrange(statistic) %>%
  flextable()

vars

statistic

p_value

sample

SedentaryActiveDistance

0.13

0

228

LoggedActivitiesDistance

0.25

0

228

VeryActiveMinutes

0.71

0

228

ModeratelyActiveDistance

0.74

0

228

ModeratelyActiveMinutes

0.74

0

228

VeryActiveDistance

0.75

0

228

TotalIntensity

0.88

0

228

AverageIntensity

0.88

0

228

HoursSlept

0.90

0

228

SedentaryMinutes

0.96

0

228

Calories

0.96

0

228

TotalSteps

0.97

0

228

TotalDistance

0.97

0

228

LightlyActiveDistance

0.98

0

228

LightlyActiveMinutes

0.98

0

228

# -- Apply similar method after grouping -- #
# Normality test per user type for the minutes-binned data, rounded to two
# decimals and ordered from lowest to highest test statistic.
df_tot_user_mins %>%
  group_by(user_type) %>%
  dlookr::normality() %>%
  # `where()` is required for predicate selection; a bare `is.numeric`
  # inside across() is deprecated.
  mutate(across(where(is.numeric), ~ round(.x, 2))) %>%
  arrange(statistic) %>%
  flextable()

variable

user_type

statistic

p_value

sample

TotalIntensity

Lightly Active

0.82

0.00

47

HoursSlept

Moderately Active

0.84

0.00

34

TotalIntensity

Sedentary

0.86

0.00

67

HoursSlept

Very Active

0.87

0.00

80

HoursSlept

Lightly Active

0.88

0.00

47

TotalIntensity

Very Active

0.88

0.00

80

Calories

Very Active

0.90

0.00

80

Calories

Moderately Active

0.90

0.01

34

HoursSlept

Sedentary

0.90

0.00

67

TotalIntensity

Moderately Active

0.90

0.00

34

TotalDistance

Sedentary

0.93

0.00

67

TotalDistance

Lightly Active

0.95

0.04

47

Calories

Lightly Active

0.96

0.07

47

TotalDistance

Very Active

0.96

0.01

80

TotalDistance

Moderately Active

0.96

0.19

34

Calories

Sedentary

0.98

0.50

67

# Normality test per user type for the distance-binned data, rounded to two
# decimals and ordered from lowest to highest test statistic.
df_tot_user_dist %>%
  group_by(user_type) %>%
  dlookr::normality() %>%
  # `where()` is required for predicate selection; a bare `is.numeric`
  # inside across() is deprecated.
  mutate(across(where(is.numeric), ~ round(.x, 2))) %>%
  arrange(statistic) %>%
  flextable()

variable

user_type

statistic

p_value

sample

HoursSlept

Moderately Active

0.81

0.00

28

TotalIntensity

Sedentary

0.84

0.00

77

HoursSlept

Very Active

0.86

0.00

83

TotalIntensity

Lightly Active

0.86

0.00

40

Calories

Very Active

0.88

0.00

83

HoursSlept

Lightly Active

0.88

0.00

40

TotalIntensity

Very Active

0.89

0.00

83

TotalIntensity

Moderately Active

0.89

0.01

28

HoursSlept

Sedentary

0.90

0.00

77

TotalDistance

Lightly Active

0.91

0.01

40

Calories

Moderately Active

0.94

0.10

28

TotalDistance

Very Active

0.97

0.03

83

TotalDistance

Moderately Active

0.97

0.46

28

TotalDistance

Sedentary

0.97

0.08

77

Calories

Lightly Active

0.98

0.74

40

Calories

Sedentary

0.98

0.39

77

We apply the Shapiro-Wilk test for normality before and after categorizing based on minutes and notice that TotalDistance for the Moderately Active population and Calories for the Sedentary population are consistent with a normal distribution (p-values of 0.19 and 0.50). As a reminder, the null hypothesis is that the distribution is Gaussian (normal); if the p-value is less than or equal to 0.05, we reject the null hypothesis and conclude that the distribution is not Gaussian.

We will now visualize the plots after binning by minutes, which lead to interesting findings:

# Categorize by Minutes
# `df` is the working data frame used by the plots that follow.
df <- df_tot_user_mins

# Bar plots to visualize the samples in the dataset.
# Bars are ordered by descending frequency (fct_infreq); the palette size is
# derived from the same data frame that is being plotted.
ggplot(
  data = df_tot_user_mins,
  aes(x = fct_infreq(user_type), fill = user_type)
) +
  geom_bar() +
  labs(
    x = "UserType", y = "Frequency", fill = "UserType",
    title = "Bar Chart: Usertype Categorized by Minutes"
  ) +
  scale_fill_manual(
    values = viridis(n = length(unique(df_tot_user_mins$user_type)))
  ) +
  theme_linedraw(base_size = 18)

ggplot(
  data = df_tot_user_dist,
  aes(x = fct_infreq(user_type), fill = user_type)
) +
  geom_bar() +
  labs(
    x = "UserType", y = "Frequency", fill = "UserType",
    title = "Bar Chart: Usertype Categorized by Distance"
  ) +
  scale_fill_manual(
    values = viridis(n = length(unique(df_tot_user_dist$user_type)))
  ) +
  theme_linedraw(base_size = 18)

# Q:Q - Correlogram to visualize the correlation between all numeric columns.
# Columns are selected and relabelled by name so the axis labels cannot be
# mismatched if the column order of `df` changes.
df_num <- df %>%
  select(
    Calories,
    "Hours Slept" = HoursSlept,
    "Total Distance" = TotalDistance,
    "Total Intensity" = TotalIntensity
  ) %>%
  drop_na()
corr_mat <- cor(as.matrix(df_num))
# corrplot() is not a ggplot object, so a ggplot2 theme cannot be added to it
# with `+` (doing so evaluated to NULL).
corrplot(corr_mat, method = 'square', tl.col = 'black')

## NULL
# C:Q Calories - distribution of Calories per user type (binned by minutes).
# Aesthetics reference columns by bare name (never `df$col`), and constant
# aesthetics such as alpha are set outside aes() so they are not turned into
# a scale with its own legend.

# Violin plot; the horizontal lines mark the quartiles.
ggplot(df, aes(x = user_type, y = Calories, fill = user_type)) +
  geom_violin(trim = FALSE, draw_quantiles = c(0.25, 0.5, 0.75)) +
  labs(
    x = "UserType", y = "Calories", fill = "UserType",
    title = "Violin Plot: Usertype vs Calories Categorized by Minutes"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df$user_type)))) +
  theme_linedraw(base_size = 18)

# Box plot.
ggplot(df, aes(x = user_type, y = Calories, fill = user_type)) +
  geom_boxplot() +
  labs(
    x = "UserType", y = "Calories", fill = "UserType",
    title = "Box Plot: Usertype vs Calories Categorized by Minutes"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df$user_type)))) +
  theme_linedraw(base_size = 18)

# Density per user type, one facet each.
ggplot(df, aes(x = Calories, group = user_type, fill = user_type)) +
  geom_density(alpha = 0.2) +
  facet_wrap(~ user_type) +
  labs(
    x = "Calories", y = "Density", fill = "UserType",
    title = "Density Plot: Calories Categorized by Minutes"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df$user_type)))) +
  theme_linedraw(base_size = 18)

# Stacked (proportional) density across user types.
ggplot(df, aes(x = Calories, group = user_type, fill = user_type)) +
  geom_density(position = "fill", adjust = 1.5) +
  labs(
    x = "Calories", y = "Density", fill = "UserType",
    title = "Density Plot: Calories Categorized by Minutes"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df$user_type)))) +
  theme_linedraw(base_size = 18)

# Empirical CDF per user type.
ggplot(df, aes(x = Calories, color = user_type)) +
  stat_ecdf(geom = "step") +
  labs(
    x = "Calories", y = "Probability", color = "UserType",
    title = "CDF: Calories Categorized by Minutes"
  ) +
  scale_color_manual(values = viridis(n = length(unique(df$user_type)))) +
  theme_linedraw(base_size = 18)

# C:Q TotalDistance - distribution of TotalDistance per user type
# (binned by minutes).
# Aesthetics reference columns by bare name (never `df$col`), and constant
# aesthetics such as alpha are set outside aes() so they are not turned into
# a scale with its own legend.

# Violin plot; the horizontal lines mark the quartiles.
ggplot(df, aes(x = user_type, y = TotalDistance, fill = user_type)) +
  geom_violin(trim = FALSE, draw_quantiles = c(0.25, 0.5, 0.75)) +
  labs(
    x = "UserType", y = "Total Distance", fill = "UserType",
    title = "Violin Plot: Usertype vs Total Distance Categorized by Minutes"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df$user_type)))) +
  theme_linedraw(base_size = 18)

# Box plot.
ggplot(df, aes(x = user_type, y = TotalDistance, fill = user_type)) +
  geom_boxplot() +
  labs(
    x = "UserType", y = "Total Distance", fill = "UserType",
    title = "Box Plot: Usertype vs Total Distance Categorized by Minutes"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df$user_type)))) +
  theme_linedraw(base_size = 18)

# Density per user type, one facet each.
ggplot(df, aes(x = TotalDistance, group = user_type, fill = user_type)) +
  geom_density(alpha = 0.2) +
  facet_wrap(~ user_type) +
  labs(
    x = "Total Distance", y = "Density", fill = "UserType",
    title = "Density Plot: Total Distance Categorized by Minutes"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df$user_type)))) +
  theme_linedraw(base_size = 18)

# Stacked (proportional) density across user types.
ggplot(df, aes(x = TotalDistance, group = user_type, fill = user_type)) +
  geom_density(position = "fill", adjust = 1.5) +
  labs(
    x = "Total Distance", y = "Density", fill = "UserType",
    title = "Density Plot: Total Distance Categorized by Minutes"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df$user_type)))) +
  theme_linedraw(base_size = 18)

# Empirical CDF per user type.
ggplot(df, aes(x = TotalDistance, color = user_type)) +
  stat_ecdf(geom = "step") +
  labs(
    x = "Total Distance", y = "Probability", color = "UserType",
    title = "CDF: Total Distance Categorized by Minutes"
  ) +
  scale_color_manual(values = viridis(n = length(unique(df$user_type)))) +
  theme_linedraw(base_size = 18)

# C:Q HoursSlept - distribution of HoursSlept per user type
# (binned by minutes).
# Aesthetics reference columns by bare name (never `df$col`), and constant
# aesthetics such as alpha are set outside aes() so they are not turned into
# a scale with its own legend.

# Violin plot; the horizontal lines mark the quartiles.
ggplot(df, aes(x = user_type, y = HoursSlept, fill = user_type)) +
  geom_violin(trim = FALSE, draw_quantiles = c(0.25, 0.5, 0.75)) +
  labs(
    x = "UserType", y = "Hours Slept", fill = "UserType",
    title = "Violin Plot: Usertype vs Hours Slept Categorized by Minutes"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df$user_type)))) +
  theme_linedraw(base_size = 18)

# Box plot.
ggplot(df, aes(x = user_type, y = HoursSlept, fill = user_type)) +
  geom_boxplot() +
  labs(
    x = "UserType", y = "Hours Slept", fill = "UserType",
    title = "Box Plot: Usertype vs Hours Slept Categorized by Minutes"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df$user_type)))) +
  theme_linedraw(base_size = 18)

# Density per user type, one facet each.
ggplot(df, aes(x = HoursSlept, group = user_type, fill = user_type)) +
  geom_density(alpha = 0.2) +
  facet_wrap(~ user_type) +
  labs(
    x = "Hours Slept", y = "Density", fill = "UserType",
    title = "Density Plot: Hours Slept Categorized by Minutes"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df$user_type)))) +
  theme_linedraw(base_size = 18)

# Stacked (proportional) density across user types.
ggplot(df, aes(x = HoursSlept, group = user_type, fill = user_type)) +
  geom_density(position = "fill", adjust = 1.5) +
  labs(
    x = "Hours Slept", y = "Density", fill = "UserType",
    title = "Density Plot: Hours Slept Categorized by Minutes"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df$user_type)))) +
  theme_linedraw(base_size = 18)

# Empirical CDF per user type.
ggplot(df, aes(x = HoursSlept, color = user_type)) +
  stat_ecdf(geom = "step") +
  labs(
    x = "Hours Slept", y = "Probability", color = "UserType",
    title = "CDF: Hours Slept Categorized by Minutes"
  ) +
  scale_color_manual(values = viridis(n = length(unique(df$user_type)))) +
  theme_linedraw(base_size = 18)

# C:Q Intensity - distribution of TotalIntensity per user type
# (binned by minutes).
# The density and CDF layers previously overrode x with HoursSlept, so they
# showed sleep data under an "Intensity" axis label; every layer now inherits
# x = TotalIntensity. Titles say "Minutes" because `df` holds the
# minutes-binned data in this section.
# Aesthetics reference columns by bare name (never `df$col`), and constant
# aesthetics such as alpha are set outside aes().

# Violin plot; the horizontal lines mark the quartiles.
ggplot(df, aes(x = user_type, y = TotalIntensity, fill = user_type)) +
  geom_violin(trim = FALSE, draw_quantiles = c(0.25, 0.5, 0.75)) +
  labs(
    x = "UserType", y = "Intensity", fill = "UserType",
    title = "Violin Plot: Usertype vs Intensity Categorized by Minutes"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df$user_type)))) +
  theme_linedraw(base_size = 18)

# Box plot.
ggplot(df, aes(x = user_type, y = TotalIntensity, fill = user_type)) +
  geom_boxplot() +
  labs(
    x = "UserType", y = "Intensity", fill = "UserType",
    title = "Box Plot: Usertype vs Intensity Categorized by Minutes"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df$user_type)))) +
  theme_linedraw(base_size = 18)

# Density per user type, one facet each.
ggplot(df, aes(x = TotalIntensity, group = user_type, fill = user_type)) +
  geom_density(alpha = 0.2) +
  facet_wrap(~ user_type) +
  labs(
    x = "Intensity", y = "Density", fill = "UserType",
    title = "Density Plot: Intensity Categorized by Minutes"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df$user_type)))) +
  theme_linedraw(base_size = 18)

# Stacked (proportional) density across user types.
ggplot(df, aes(x = TotalIntensity, group = user_type, fill = user_type)) +
  geom_density(position = "fill", adjust = 1.5) +
  labs(
    x = "Intensity", y = "Density", fill = "UserType",
    title = "Density Plot: Intensity Categorized by Minutes"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df$user_type)))) +
  theme_linedraw(base_size = 18)

# Empirical CDF per user type.
ggplot(df, aes(x = TotalIntensity, color = user_type)) +
  stat_ecdf(geom = "step") +
  labs(
    x = "Intensity", y = "Probability", color = "UserType",
    title = "CDF: Intensity Categorized by Minutes"
  ) +
  scale_color_manual(values = viridis(n = length(unique(df$user_type)))) +
  theme_linedraw(base_size = 18)

We notice that Calories is least correlated with Hours Slept and most correlated with Total Distance traveled. Both the Very Active and Moderately Active densities are bimodally distributed, suggesting that those who burn a large amount of calories are either getting enough sleep or are sleep deprived. Interestingly, those who are Sedentary are most likely to oversleep (get more than 8 hours of daily sleep - seen in the CDF plot of Hours Slept and the Stacked Density plot).

Additionally, we see a greater proportion of the Sedentary population when binning by Distance compared to Minutes. This implies that when we categorize by Distance, there is a greater weight at the tails of the activity levels; however, there is not too much of a difference between the other plots.

The Stacked Density plots of Total Distance suggest most of the Sedentary population does not travel more than 3 miles (around 82% by the CDF plot) compared to about 6 miles for the Lightly Active population (at 75th percentile). The gap (width between groups) is narrowed for Moderately Active and Very Active populations. We also see the facet wrap of the Total Distance is Gaussian-like for the Moderately Active group and highly skewed left. In contrast, the Total Distance for Sedentary population is skewed right. These plots suggest that our categorization coincides with our intuition.

The violin and box plots of Calories coincide with our preliminary visualizations. The Very Active group is bimodal and the Sedentary group has the smallest third quartile. Getting light activity in the day increases your likelihood of burning more calories, and it is not necessary to be highly active to burn more than 2000 calories.

Supplemental Visualizations (Binning by Distance)

Similar visualizations can be obtained when binning by distance instead of minutes:

# Categorize by Distance
# `df` is the working data frame used by the plots that follow.
df <- df_tot_user_dist

# C:Q Calories - distribution of Calories per user type (binned by distance).
# Aesthetics reference columns by bare name (never `df$col`), and constant
# aesthetics such as alpha are set outside aes() so they are not turned into
# a scale with its own legend.

# Violin plot; the horizontal lines mark the quartiles.
ggplot(df, aes(x = user_type, y = Calories, fill = user_type)) +
  geom_violin(trim = FALSE, draw_quantiles = c(0.25, 0.5, 0.75)) +
  labs(
    x = "UserType", y = "Calories", fill = "UserType",
    title = "Violin Plot: Usertype vs Calories Categorized by Distance"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df$user_type)))) +
  theme_linedraw(base_size = 18)

# Box plot.
ggplot(df, aes(x = user_type, y = Calories, fill = user_type)) +
  geom_boxplot() +
  labs(
    x = "UserType", y = "Calories", fill = "UserType",
    title = "Box Plot: Usertype vs Calories Categorized by Distance"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df$user_type)))) +
  theme_linedraw(base_size = 18)

# Density per user type, one facet each.
ggplot(df, aes(x = Calories, group = user_type, fill = user_type)) +
  geom_density(alpha = 0.2) +
  facet_wrap(~ user_type) +
  labs(
    x = "Calories", y = "Density", fill = "UserType",
    title = "Density Plot: Calories Categorized by Distance"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df$user_type)))) +
  theme_linedraw(base_size = 18)

# Stacked (proportional) density across user types.
ggplot(df, aes(x = Calories, group = user_type, fill = user_type)) +
  geom_density(position = "fill", adjust = 1.5) +
  labs(
    x = "Calories", y = "Density", fill = "UserType",
    title = "Density Plot: Calories Categorized by Distance"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df$user_type)))) +
  theme_linedraw(base_size = 18)

# Empirical CDF per user type.
ggplot(df, aes(x = Calories, color = user_type)) +
  stat_ecdf(geom = "step") +
  labs(
    x = "Calories", y = "Probability", color = "UserType",
    title = "CDF: Calories Categorized by Distance"
  ) +
  scale_color_manual(values = viridis(n = length(unique(df$user_type)))) +
  theme_linedraw(base_size = 18)

# C:Q TotalDistance - distribution of TotalDistance per user type
# (binned by distance).
# Aesthetics reference columns by bare name (never `df$col`), and constant
# aesthetics such as alpha are set outside aes() so they are not turned into
# a scale with its own legend.

# Violin plot; the horizontal lines mark the quartiles.
ggplot(df, aes(x = user_type, y = TotalDistance, fill = user_type)) +
  geom_violin(trim = FALSE, draw_quantiles = c(0.25, 0.5, 0.75)) +
  labs(
    x = "UserType", y = "Total Distance", fill = "UserType",
    title = "Violin Plot: Usertype vs Total Distance Categorized by Distance"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df$user_type)))) +
  theme_linedraw(base_size = 18)

# Box plot.
ggplot(df, aes(x = user_type, y = TotalDistance, fill = user_type)) +
  geom_boxplot() +
  labs(
    x = "UserType", y = "Total Distance", fill = "UserType",
    title = "Box Plot: Usertype vs Total Distance Categorized by Distance"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df$user_type)))) +
  theme_linedraw(base_size = 18)

# Density per user type, one facet each.
ggplot(df, aes(x = TotalDistance, group = user_type, fill = user_type)) +
  geom_density(alpha = 0.2) +
  facet_wrap(~ user_type) +
  labs(
    x = "Total Distance", y = "Density", fill = "UserType",
    title = "Density Plot: Total Distance Categorized by Distance"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df$user_type)))) +
  theme_linedraw(base_size = 18)

# Stacked (proportional) density across user types.
ggplot(df, aes(x = TotalDistance, group = user_type, fill = user_type)) +
  geom_density(position = "fill", adjust = 1.5) +
  labs(
    x = "Total Distance", y = "Density", fill = "UserType",
    title = "Density Plot: Total Distance Categorized by Distance"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df$user_type)))) +
  theme_linedraw(base_size = 18)

# Empirical CDF per user type.
ggplot(df, aes(x = TotalDistance, color = user_type)) +
  stat_ecdf(geom = "step") +
  labs(
    x = "Total Distance", y = "Probability", color = "UserType",
    title = "CDF: Total Distance Categorized by Distance"
  ) +
  scale_color_manual(values = viridis(n = length(unique(df$user_type)))) +
  theme_linedraw(base_size = 18)

# C:Q HoursSlept - distribution of HoursSlept per user type
# (binned by distance).
# Aesthetics reference columns by bare name (never `df$col`), and constant
# aesthetics such as alpha are set outside aes() so they are not turned into
# a scale with its own legend.

# Violin plot; the horizontal lines mark the quartiles.
ggplot(df, aes(x = user_type, y = HoursSlept, fill = user_type)) +
  geom_violin(trim = FALSE, draw_quantiles = c(0.25, 0.5, 0.75)) +
  labs(
    x = "UserType", y = "Hours Slept", fill = "UserType",
    title = "Violin Plot: Usertype vs Hours Slept Categorized by Distance"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df$user_type)))) +
  theme_linedraw(base_size = 18)

# Box plot.
ggplot(df, aes(x = user_type, y = HoursSlept, fill = user_type)) +
  geom_boxplot() +
  labs(
    x = "UserType", y = "Hours Slept", fill = "UserType",
    title = "Box Plot: Usertype vs Hours Slept Categorized by Distance"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df$user_type)))) +
  theme_linedraw(base_size = 18)

# Density per user type, one facet each.
ggplot(df, aes(x = HoursSlept, group = user_type, fill = user_type)) +
  geom_density(alpha = 0.2) +
  facet_wrap(~ user_type) +
  labs(
    x = "Hours Slept", y = "Density", fill = "UserType",
    title = "Density Plot: Hours Slept Categorized by Distance"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df$user_type)))) +
  theme_linedraw(base_size = 18)

# Stacked (proportional) density across user types.
ggplot(df, aes(x = HoursSlept, group = user_type, fill = user_type)) +
  geom_density(position = "fill", adjust = 1.5) +
  labs(
    x = "Hours Slept", y = "Density", fill = "UserType",
    title = "Density Plot: Hours Slept Categorized by Distance"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df$user_type)))) +
  theme_linedraw(base_size = 18)

# Empirical CDF per user type.
ggplot(df, aes(x = HoursSlept, color = user_type)) +
  stat_ecdf(geom = "step") +
  labs(
    x = "Hours Slept", y = "Probability", color = "UserType",
    title = "CDF: Hours Slept Categorized by Distance"
  ) +
  scale_color_manual(values = viridis(n = length(unique(df$user_type)))) +
  theme_linedraw(base_size = 18)

# C:Q Intensity - distribution of TotalIntensity per user type
# (binned by distance).
# The density and CDF layers previously overrode x with HoursSlept, so they
# showed sleep data under an "Intensity" axis label; every layer now inherits
# x = TotalIntensity.
# Aesthetics reference columns by bare name (never `df$col`), and constant
# aesthetics such as alpha are set outside aes().

# Violin plot; the horizontal lines mark the quartiles.
ggplot(df, aes(x = user_type, y = TotalIntensity, fill = user_type)) +
  geom_violin(trim = FALSE, draw_quantiles = c(0.25, 0.5, 0.75)) +
  labs(
    x = "UserType", y = "Intensity", fill = "UserType",
    title = "Violin Plot: Usertype vs Intensity Categorized by Distance"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df$user_type)))) +
  theme_linedraw(base_size = 18)

# Box plot.
ggplot(df, aes(x = user_type, y = TotalIntensity, fill = user_type)) +
  geom_boxplot() +
  labs(
    x = "UserType", y = "Intensity", fill = "UserType",
    title = "Box Plot: Usertype vs Intensity Categorized by Distance"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df$user_type)))) +
  theme_linedraw(base_size = 18)

# Density per user type, one facet each.
ggplot(df, aes(x = TotalIntensity, group = user_type, fill = user_type)) +
  geom_density(alpha = 0.2) +
  facet_wrap(~ user_type) +
  labs(
    x = "Intensity", y = "Density", fill = "UserType",
    title = "Density Plot: Intensity Categorized by Distance"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df$user_type)))) +
  theme_linedraw(base_size = 18)

# Stacked (proportional) density across user types.
ggplot(df, aes(x = TotalIntensity, group = user_type, fill = user_type)) +
  geom_density(position = "fill", adjust = 1.5) +
  labs(
    x = "Intensity", y = "Density", fill = "UserType",
    title = "Density Plot: Intensity Categorized by Distance"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df$user_type)))) +
  theme_linedraw(base_size = 18)

# Empirical CDF per user type.
ggplot(df, aes(x = TotalIntensity, color = user_type)) +
  stat_ecdf(geom = "step") +
  labs(
    x = "Intensity", y = "Probability", color = "UserType",
    title = "CDF: Intensity Categorized by Distance"
  ) +
  scale_color_manual(values = viridis(n = length(unique(df$user_type)))) +
  theme_linedraw(base_size = 18)