COMP 4033 Assignment 1 - Data Analysis Using R Programming

# Load the health and fitness records; the relative path is resolved against
# the current working directory.
healthfitness <- read.csv(file = "healthfitness.csv")
  1. Print the structure of your dataset.
# Compact overview: one line per column with its type and first few values.
str(object = healthfitness)
## 'data.frame':    687701 obs. of  22 variables:
##  $ participant_id          : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ date                    : chr  "2024-01-01" "2024-01-04" "2024-01-05" "2024-01-07" ...
##  $ age                     : int  56 56 56 56 56 56 56 56 56 56 ...
##  $ gender                  : chr  "F" "F" "F" "F" ...
##  $ height_cm               : num  165 165 165 165 165 ...
##  $ weight_kg               : num  53.7 53.9 54.2 54.4 54.7 54.9 55.2 55.5 55.7 56 ...
##  $ activity_type           : chr  "Dancing" "Swimming" "Swimming" "Weight Training" ...
##  $ duration_minutes        : int  41 28 21 99 100 31 97 70 89 115 ...
##  $ intensity               : chr  "Low" "Low" "Medium" "Medium" ...
##  $ calories_burned         : num  3.3 2.9 2.6 10.7 12.7 6.8 12.4 12.9 19.7 12.8 ...
##  $ avg_heart_rate          : int  103 102 126 141 112 121 145 99 112 117 ...
##  $ hours_sleep             : num  6.6 8.1 6.2 7.2 7.1 7.5 6.6 6.1 7.2 5.6 ...
##  $ stress_level            : int  3 7 7 8 1 10 8 7 7 3 ...
##  $ daily_steps             : int  7128 7925 7557 11120 5406 10202 5912 9477 9710 7830 ...
##  $ hydration_level         : num  1.5 1.8 2.7 2.6 1.5 2.2 2.8 1.6 3.3 2 ...
##  $ bmi                     : num  19.6 19.6 19.6 19.6 19.6 19.6 19.6 19.6 19.6 19.6 ...
##  $ resting_heart_rate      : num  69.5 69.5 69.5 69.5 69.5 69.5 69.5 69.5 69.5 69.5 ...
##  $ blood_pressure_systolic : num  111 111 111 111 111 ...
##  $ blood_pressure_diastolic: num  72.9 72.9 72.9 72.9 72.9 72.9 72.9 72.9 72.9 72.9 ...
##  $ health_condition        : chr  "None" "None" "None" "None" ...
##  $ smoking_status          : chr  "Never" "Never" "Never" "Never" ...
##  $ fitness_level           : num  0.04 0.07 0.09 0.21 0.33 0.37 0.51 0.58 0.68 0.82 ...
  2. List the variables in your dataset.
# Column names, in the order they are stored in the data frame.
colnames(healthfitness)
##  [1] "participant_id"           "date"                    
##  [3] "age"                      "gender"                  
##  [5] "height_cm"                "weight_kg"               
##  [7] "activity_type"            "duration_minutes"        
##  [9] "intensity"                "calories_burned"         
## [11] "avg_heart_rate"           "hours_sleep"             
## [13] "stress_level"             "daily_steps"             
## [15] "hydration_level"          "bmi"                     
## [17] "resting_heart_rate"       "blood_pressure_systolic" 
## [19] "blood_pressure_diastolic" "health_condition"        
## [21] "smoking_status"           "fitness_level"
  3. Print the top 15 rows of your dataset.
# First 15 observations of the data frame.
head(x = healthfitness, n = 15L)
  4. Write a user defined function using any of the variables from the data set.
# Return the most frequent value of a vector.
#
# x: an atomic vector (numeric, integer, character, ...).
# Returns a length-one vector of the same type as `x`. When several values
# share the highest count, the one that occurs first in `x` is returned.
getmode <- function(x) {
  distinct_values <- unique(x)
  # Position of every element of x within the distinct values
  positions <- match(x, distinct_values)
  # Number of occurrences of each distinct value, in order of first appearance
  occurrences <- tabulate(positions)
  distinct_values[which.max(occurrences)]
}
# Most frequently recorded average heart rate across all rows
mode_avghr <- getmode(healthfitness[["avg_heart_rate"]])
print(mode_avghr)
## [1] 126
  5. Use data manipulation techniques and filter rows based on any logical criteria that exist in your dataset.
if(!require("tidyverse")) install.packages("tidyverse",repos="https://cloud.r-project.org/")
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library("tidyverse")
# Keep only the rows of current smokers whose resting heart rate is above 70.
# Columns are referenced by bare name: filter() evaluates its conditions inside
# the piped-in data, whereas `healthfitness$...` would reach around it to the
# global object. The input is already a data.frame, so no as.data.frame()
# conversion is needed on the result.
smokeheartrate <- healthfitness %>%
  filter(resting_heart_rate > 70, smoking_status == "Current")
view(smokeheartrate)
  6. Identify the dependent & independent variables and use reshaping techniques and create a new data frame by joining those variables from your dataset.
# Dependent (response) variable     - Fitness Level
# Independent (predictor) variables - Age, Average Heart Rate, Activity Duration
#   Minutes, Calories Burned, Hours of Sleep, Stress Level, Daily Steps
#
# The columns are joined straight into a data frame so that each one keeps its
# own type (cbind() would first build a matrix and coerce every integer column
# to double). check.names = FALSE preserves the spaces in the column names.
healthfitnesslevel <- data.frame(
  "Fitness Level" = healthfitness$fitness_level,
  "Age" = healthfitness$age,
  "Average Heart Rate" = healthfitness$avg_heart_rate,
  "Activity Duration" = healthfitness$duration_minutes,
  "Calories Burned" = healthfitness$calories_burned,
  "Hours of Sleep" = healthfitness$hours_sleep,
  "Stress Level" = healthfitness$stress_level,
  "Daily Steps" = healthfitness$daily_steps,
  check.names = FALSE
)
view(healthfitnesslevel)
  7. Remove missing values in your dataset.
# Drop every row that contains at least one missing value. na.omit() returns a
# cleaned copy and leaves its argument untouched, so the result has to be
# assigned for the removal to take effect.
healthfitnessomit <- na.omit(healthfitness)
view(healthfitnessomit)
  8. Identify and remove duplicated data from your dataset.
# Keep the first row found for each participant_id, retaining all other columns
healthfitunique <- distinct(healthfitness, participant_id, .keep_all = TRUE)
view(healthfitunique)
  9. Reorder multiple rows in descending order.
# Sort the records from highest to lowest on four different columns.
# Columns are named directly inside arrange(), which looks them up in the
# piped-in data; the `healthfitness$` prefix and the as.data.frame() wrapper
# used previously were unnecessary.
fitleveldescend <- healthfitness %>% arrange(desc(fitness_level))
view(fitleveldescend)
agedescend <- healthfitness %>% arrange(desc(age))
view(agedescend)
heightdesc <- healthfitness %>% arrange(desc(height_cm))
view(heightdesc)
weightdesc <- healthfitness %>% arrange(desc(weight_kg))
view(weightdesc)
  10. Rename some of the column names in your dataset.
# Give columns 1 and 5-8 human-readable titles in a single assignment;
# the new names are listed in the same order as the column positions.
names(healthfitness)[c(1, 5, 6, 7, 8)] <- c(
  "Participant ID",
  "Height (cm)",
  "Weight (kg)",
  "Activity Type",
  "Activity Duration (mins)"
)
view(healthfitness)
  11. Add new variables in your data frame by using a mathematical function (e.g., multiply an existing column by 2 and add it as a new variable to your data frame).
# New column: one third of the systolic reading plus two thirds of the
# diastolic reading, computed row by row.
healthfitness$mean_arterial_pressure <- with(
  healthfitness,
  1/3 * blood_pressure_systolic + 2/3 * blood_pressure_diastolic
)
view(healthfitness)
  12. Create a training set using a random number generator engine.
# Using dataset from Q6
# na.omit() returns a cleaned copy instead of editing its argument, so it is
# chained into the sampling step; otherwise rows with missing values would
# still be eligible for the training set.
set.seed(5678) # fixed seed so the same 80% sample is drawn on every run
healthfittrain <- healthfitnesslevel %>%
  na.omit() %>%
  sample_frac(0.80, replace = FALSE)
view(healthfittrain)
  13. Print the summary statistics of your dataset.
# Per-column summary of the whole data frame.
summary(object = healthfitness)
##  Participant ID     date                age           gender         
##  Min.   :   1   Length:687701      Min.   :18.00   Length:687701     
##  1st Qu.: 749   Class :character   1st Qu.:30.00   Class :character  
##  Median :1499   Mode  :character   Median :42.00   Mode  :character  
##  Mean   :1500                      Mean   :41.66                     
##  3rd Qu.:2249                      3rd Qu.:53.00                     
##  Max.   :3000                      Max.   :64.00                     
##   Height (cm)     Weight (kg)     Activity Type      Activity Duration (mins)
##  Min.   :145.0   Min.   : 45.30   Length:687701      Min.   : 20.00          
##  1st Qu.:161.7   1st Qu.: 78.20   Class :character   1st Qu.: 45.00          
##  Median :168.2   Median : 94.60   Mode  :character   Median : 70.00          
##  Mean   :168.6   Mean   : 94.92                      Mean   : 70.01          
##  3rd Qu.:175.3   3rd Qu.:110.50                      3rd Qu.: 95.00          
##  Max.   :198.5   Max.   :188.40                      Max.   :120.00          
##   intensity         calories_burned avg_heart_rate   hours_sleep    
##  Length:687701      Min.   : 0.80   Min.   : 82.0   Min.   : 4.000  
##  Class :character   1st Qu.: 7.80   1st Qu.:118.0   1st Qu.: 6.400  
##  Mode  :character   Median :13.00   Median :130.0   Median : 7.000  
##                     Mean   :15.38   Mean   :131.5   Mean   : 7.049  
##                     3rd Qu.:20.70   3rd Qu.:144.0   3rd Qu.: 7.700  
##                     Max.   :92.00   Max.   :206.0   Max.   :10.000  
##   stress_level     daily_steps    hydration_level      bmi       
##  Min.   : 1.000   Min.   : -419   Min.   :1.500   Min.   :14.20  
##  1st Qu.: 3.000   1st Qu.: 7203   1st Qu.:2.000   1st Qu.:20.10  
##  Median : 5.000   Median : 8607   Median :2.500   Median :22.40  
##  Mean   : 5.252   Mean   : 8628   Mean   :2.499   Mean   :22.73  
##  3rd Qu.: 8.000   3rd Qu.:10027   3rd Qu.:3.000   3rd Qu.:25.10  
##  Max.   :10.000   Max.   :17241   Max.   :3.500   Max.   :38.80  
##  resting_heart_rate blood_pressure_systolic blood_pressure_diastolic
##  Min.   :51.10      Min.   : 78.0           Min.   : 53.70          
##  1st Qu.:66.50      1st Qu.:113.1           1st Qu.: 74.60          
##  Median :70.00      Median :120.2           Median : 80.10          
##  Mean   :70.01      Mean   :120.0           Mean   : 80.19          
##  3rd Qu.:73.50      3rd Qu.:127.0           3rd Qu.: 85.70          
##  Max.   :87.10      Max.   :152.7           Max.   :112.10          
##  health_condition   smoking_status     fitness_level    mean_arterial_pressure
##  Length:687701      Length:687701      Min.   : 0.020   Min.   : 70.47        
##  Class :character   Class :character   1st Qu.: 4.770   1st Qu.: 89.10        
##  Mode  :character   Mode  :character   Median : 9.510   Median : 93.60        
##                                        Mean   : 9.525   Mean   : 93.46        
##                                        3rd Qu.:14.230   3rd Qu.: 97.70        
##                                        Max.   :21.930   Max.   :117.37
  14. Use any of the numerical variables from the dataset and perform the following statistical functions: Mean, Median, Mode, Range
# Mean (weight)
mean_weight <- mean(healthfitness[["Weight (kg)"]])
print(mean_weight)
## [1] 94.92198
# Median (age)
median_age <- median(healthfitness[["age"]])
print(median_age)
## [1] 42
# Mode (height), using function from Q4
mode_height <- getmode(healthfitness[["Height (cm)"]])
print(mode_height)
## [1] 162
# Range (hours of sleep): minimum and maximum as a length-two vector
range_hours_sleep <- range(healthfitness[["hours_sleep"]])
print(range_hours_sleep)
## [1]  4 10
  15. Plot a scatter plot for any 2 variables in your dataset.
if(!require("ggplot2")) install.packages("ggplot2",repos="https://cloud.r-project.org/")
library(ggplot2)
# Scatter plot of fitness level against daily steps; shape 21 is a circle
# that takes a separate fill colour.
ggplot(healthfitness, aes(daily_steps, fitness_level)) +
  geom_point(shape = 21, fill = "plum1")

  16. Plot a bar plot for any 2 variables in your dataset.
# Bar plot: number of records at each daily step count
ggplot(healthfitness, aes(daily_steps)) +
  geom_bar(fill = "plum1")

# Bar plot: number of records at each value of hours slept
ggplot(healthfitness, aes(hours_sleep)) +
  geom_bar(fill = "plum1")

  17. Find the correlation between any 2 variables by applying least square linear regression model.
# Pearson correlation coefficient between fitness level and daily steps
stepsfitnesscorr <- with(
  healthfitness,
  cor(fitness_level, daily_steps, method = "pearson")
)
print(stepsfitnesscorr)
## [1] -0.4495721