COMP 4033 Assignment 1 - Data Analysis Using R Programming
# Load the dataset; the relative path is resolved against the current working directory
healthfitness = read.csv("healthfitness.csv")
- Print the structure of your dataset.
# One line per column: name, type and the first few values
str(healthfitness)
## 'data.frame': 687701 obs. of 22 variables:
## $ participant_id : int 1 1 1 1 1 1 1 1 1 1 ...
## $ date : chr "2024-01-01" "2024-01-04" "2024-01-05" "2024-01-07" ...
## $ age : int 56 56 56 56 56 56 56 56 56 56 ...
## $ gender : chr "F" "F" "F" "F" ...
## $ height_cm : num 165 165 165 165 165 ...
## $ weight_kg : num 53.7 53.9 54.2 54.4 54.7 54.9 55.2 55.5 55.7 56 ...
## $ activity_type : chr "Dancing" "Swimming" "Swimming" "Weight Training" ...
## $ duration_minutes : int 41 28 21 99 100 31 97 70 89 115 ...
## $ intensity : chr "Low" "Low" "Medium" "Medium" ...
## $ calories_burned : num 3.3 2.9 2.6 10.7 12.7 6.8 12.4 12.9 19.7 12.8 ...
## $ avg_heart_rate : int 103 102 126 141 112 121 145 99 112 117 ...
## $ hours_sleep : num 6.6 8.1 6.2 7.2 7.1 7.5 6.6 6.1 7.2 5.6 ...
## $ stress_level : int 3 7 7 8 1 10 8 7 7 3 ...
## $ daily_steps : int 7128 7925 7557 11120 5406 10202 5912 9477 9710 7830 ...
## $ hydration_level : num 1.5 1.8 2.7 2.6 1.5 2.2 2.8 1.6 3.3 2 ...
## $ bmi : num 19.6 19.6 19.6 19.6 19.6 19.6 19.6 19.6 19.6 19.6 ...
## $ resting_heart_rate : num 69.5 69.5 69.5 69.5 69.5 69.5 69.5 69.5 69.5 69.5 ...
## $ blood_pressure_systolic : num 111 111 111 111 111 ...
## $ blood_pressure_diastolic: num 72.9 72.9 72.9 72.9 72.9 72.9 72.9 72.9 72.9 72.9 ...
## $ health_condition : chr "None" "None" "None" "None" ...
## $ smoking_status : chr "Never" "Never" "Never" "Never" ...
## $ fitness_level : num 0.04 0.07 0.09 0.21 0.33 0.37 0.51 0.58 0.68 0.82 ...
- List the variables in your dataset.
# Column names only, in the order they appear in the data frame
names(healthfitness)
## [1] "participant_id" "date"
## [3] "age" "gender"
## [5] "height_cm" "weight_kg"
## [7] "activity_type" "duration_minutes"
## [9] "intensity" "calories_burned"
## [11] "avg_heart_rate" "hours_sleep"
## [13] "stress_level" "daily_steps"
## [15] "hydration_level" "bmi"
## [17] "resting_heart_rate" "blood_pressure_systolic"
## [19] "blood_pressure_diastolic" "health_condition"
## [21] "smoking_status" "fitness_level"
- Print the top 15 rows of your dataset.
# head() defaults to 6 rows, so the row count is passed explicitly
head(healthfitness, n=15)
- Write a user defined function using any of the variables from the
data set.
# Return the most frequent value of a vector.
# x: any atomic vector (numeric, character, ...).
# Returns a single element of x; when several values are tied for the
# highest count, the one that appears first in x is returned.
getmode = function(x) {
  # Distinct values, in order of first appearance
  candidates = unique(x)
  # Map every element to the position of its distinct value, then count
  # how many elements landed on each position
  counts = tabulate(match(x, candidates))
  # which.max() picks the first position holding the largest count
  candidates[which.max(counts)]
}
# Most frequently recorded average heart rate across all rows
mode_avghr = getmode(healthfitness[["avg_heart_rate"]])
print(mode_avghr)
## [1] 126
- Use data manipulation techniques and filter rows based on any
logical criteria that exist in your dataset.
# Install tidyverse on first use: require() returns FALSE (rather than raising an error) when the package is missing
if(!require("tidyverse")) install.packages("tidyverse",repos="https://cloud.r-project.org/")
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library("tidyverse")
# Keep only the rows for current smokers whose resting heart rate is above 70.
# Columns are referenced by bare name: inside dplyr verbs the columns of the
# piped data are already in scope, whereas `healthfitness$...` would read the
# global object and ignore whatever was actually piped in.
smokeheartrate = healthfitness %>%
  filter(resting_heart_rate > 70, smoking_status == "Current") %>%
  as.data.frame()
view(smokeheartrate)
- Identify the dependent & independent variables and use reshaping
techniques and create a new data frame by joining those variables from
your dataset.
# Independent - Fitness Level
# Dependent - Age, Average Heart Rate, Activity Duration Minutes, Calories Burned, Hours of Sleep, Stress Level, Daily Steps
# NOTE(review): the scatter plot later in this file puts fitness_level on the
# y axis, which suggests it is treated as the response - confirm that the
# independent/dependent labels above are not swapped.
#
# Build the data frame directly instead of going through cbind(): cbind() on
# vectors produces a matrix, which forces every column to one common type
# (integer columns become doubles, and a single character column would turn
# everything into text). check.names = FALSE keeps the spaces in the headings.
healthfitnesslevel = data.frame(
  "Fitness Level" = healthfitness$fitness_level,
  "Age" = healthfitness$age,
  "Average Heart Rate" = healthfitness$avg_heart_rate,
  "Activity Duration" = healthfitness$duration_minutes,
  "Calories Burned" = healthfitness$calories_burned,
  "Hours of Sleep" = healthfitness$hours_sleep,
  "Stress Level" = healthfitness$stress_level,
  "Daily Steps" = healthfitness$daily_steps,
  check.names = FALSE
)
view(healthfitnesslevel)
- Remove missing values in your dataset.
# Drop every row that contains at least one NA.
# na.omit() returns a cleaned copy and leaves its argument untouched, so the
# result must be assigned for the removal to take effect.
healthfitnessomit = na.omit(as.data.frame(healthfitness))
view(healthfitnessomit)
- Identify and remove duplicated data from your dataset.
# Keep the first row found for each participant_id (all other columns are
# retained), so the row count of the result is the number of unique participants
healthfitunique = distinct(healthfitness, participant_id, .keep_all = TRUE)
view(healthfitunique)
- Reorder multiple rows in descending order
# Each object below is a copy of the full dataset sorted from the largest to
# the smallest value of one column. The sort key is given as a bare column
# name so that arrange() orders the piped data itself rather than looking the
# column up again on the global `healthfitness` object.

# Highest fitness level first
fitleveldescend = healthfitness %>%
  arrange(desc(fitness_level)) %>%
  as.data.frame()
view(fitleveldescend)

# Oldest participants first
agedescend = healthfitness %>%
  arrange(desc(age)) %>%
  as.data.frame()
view(agedescend)

# Tallest participants first
heightdesc = healthfitness %>%
  arrange(desc(height_cm)) %>%
  as.data.frame()
view(heightdesc)

# Heaviest recorded weight first
weightdesc = healthfitness %>%
  arrange(desc(weight_kg)) %>%
  as.data.frame()
view(weightdesc)
- Rename some of the column names in your dataset.
# Give five columns human-readable headings in a single assignment.
# Columns are addressed by position (1, 5, 6, 7, 8), so this relies on the
# column order of the data frame as loaded.
names(healthfitness)[c(1, 5, 6, 7, 8)] = c(
  "Participant ID",
  "Height (cm)",
  "Weight (kg)",
  "Activity Type",
  "Activity Duration (mins)"
)
view(healthfitness)
- Add new variables in your data frame by using a mathematical
function (e.g., multiply an existing column by 2 and add it as a
new variable to your data frame).
# New column: one third of the systolic reading plus two thirds of the
# diastolic reading, computed row by row
healthfitness$mean_arterial_pressure = with(
  healthfitness,
  1/3 * blood_pressure_systolic + 2/3 * blood_pressure_diastolic
)
view(healthfitness)
- Create a training set using a random number generator engine.
# Using dataset from Q6
# Draw a reproducible 80% random sample (without replacement) of the complete
# rows to serve as the training set. na.omit() returns a cleaned copy instead
# of modifying its argument, so it is placed in the pipeline; calling it on
# its own line would discard the result.
set.seed(5678)
healthfittrain = as.data.frame(healthfitnesslevel) %>%
  na.omit() %>%
  sample_frac(0.80, replace = FALSE)
view(healthfittrain)
- Print the summary statistics of your dataset.
# Per-column summary: min/quartiles/mean/max for numeric columns, length and class for character columns
summary(healthfitness)
## Participant ID date age gender
## Min. : 1 Length:687701 Min. :18.00 Length:687701
## 1st Qu.: 749 Class :character 1st Qu.:30.00 Class :character
## Median :1499 Mode :character Median :42.00 Mode :character
## Mean :1500 Mean :41.66
## 3rd Qu.:2249 3rd Qu.:53.00
## Max. :3000 Max. :64.00
## Height (cm) Weight (kg) Activity Type Activity Duration (mins)
## Min. :145.0 Min. : 45.30 Length:687701 Min. : 20.00
## 1st Qu.:161.7 1st Qu.: 78.20 Class :character 1st Qu.: 45.00
## Median :168.2 Median : 94.60 Mode :character Median : 70.00
## Mean :168.6 Mean : 94.92 Mean : 70.01
## 3rd Qu.:175.3 3rd Qu.:110.50 3rd Qu.: 95.00
## Max. :198.5 Max. :188.40 Max. :120.00
## intensity calories_burned avg_heart_rate hours_sleep
## Length:687701 Min. : 0.80 Min. : 82.0 Min. : 4.000
## Class :character 1st Qu.: 7.80 1st Qu.:118.0 1st Qu.: 6.400
## Mode :character Median :13.00 Median :130.0 Median : 7.000
## Mean :15.38 Mean :131.5 Mean : 7.049
## 3rd Qu.:20.70 3rd Qu.:144.0 3rd Qu.: 7.700
## Max. :92.00 Max. :206.0 Max. :10.000
## stress_level daily_steps hydration_level bmi
## Min. : 1.000 Min. : -419 Min. :1.500 Min. :14.20
## 1st Qu.: 3.000 1st Qu.: 7203 1st Qu.:2.000 1st Qu.:20.10
## Median : 5.000 Median : 8607 Median :2.500 Median :22.40
## Mean : 5.252 Mean : 8628 Mean :2.499 Mean :22.73
## 3rd Qu.: 8.000 3rd Qu.:10027 3rd Qu.:3.000 3rd Qu.:25.10
## Max. :10.000 Max. :17241 Max. :3.500 Max. :38.80
## resting_heart_rate blood_pressure_systolic blood_pressure_diastolic
## Min. :51.10 Min. : 78.0 Min. : 53.70
## 1st Qu.:66.50 1st Qu.:113.1 1st Qu.: 74.60
## Median :70.00 Median :120.2 Median : 80.10
## Mean :70.01 Mean :120.0 Mean : 80.19
## 3rd Qu.:73.50 3rd Qu.:127.0 3rd Qu.: 85.70
## Max. :87.10 Max. :152.7 Max. :112.10
## health_condition smoking_status fitness_level mean_arterial_pressure
## Length:687701 Length:687701 Min. : 0.020 Min. : 70.47
## Class :character Class :character 1st Qu.: 4.770 1st Qu.: 89.10
## Mode :character Mode :character Median : 9.510 Median : 93.60
## Mean : 9.525 Mean : 93.46
## 3rd Qu.:14.230 3rd Qu.: 97.70
## Max. :21.930 Max. :117.37
- Use any of the numerical variables from the dataset and perform the
following statistical functions: Mean, Median, Mode, Range
# Mean of the weight column (renamed earlier to "Weight (kg)")
mean_weight = mean(healthfitness[["Weight (kg)"]])
print(mean_weight)
## [1] 94.92198
# Median of the age column
median_age = median(healthfitness[["age"]])
print(median_age)
## [1] 42
# Most frequent height, reusing getmode() from Q4
mode_height = getmode(healthfitness[["Height (cm)"]])
print(mode_height)
## [1] 162
# Smallest and largest recorded hours of sleep (range() returns both ends)
range_hours_sleep = range(healthfitness[["hours_sleep"]])
print(range_hours_sleep)
## [1] 4 10
- Plot a scatter plot for any 2 variables in your dataset.
# Install ggplot2 only when it is missing. requireNamespace() checks for the
# package without attaching it, leaving the actual loading to library(),
# which fails loudly if the install did not succeed.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2", repos = "https://cloud.r-project.org/")
}
library(ggplot2)
# Scatter plot of fitness level (y) against daily steps (x);
# shape 21 is a filled circle, so `fill` sets the interior colour
ggplot(data = healthfitness, aes(x = daily_steps, y = fitness_level)) +
  geom_point(fill = "plum1", shape = 21)

- Plot a bar plot for any 2 variables in your dataset
# Bar chart: number of rows recorded for each distinct daily step count
ggplot(data = healthfitness, aes(x = daily_steps)) +
  geom_bar(fill = "plum1")

# Bar chart: number of rows recorded for each distinct hours-of-sleep value
ggplot(data = healthfitness, aes(x = hours_sleep)) +
  geom_bar(fill = "plum1")

- Find the correlation between any 2 variables by applying a
least-squares linear regression model.
# Least-squares fit of fitness level on daily steps. For a regression with a
# single predictor, the model's R-squared equals the square of the Pearson
# correlation printed below; summary(stepsfitnessmodel) shows the slope,
# intercept and R-squared.
stepsfitnessmodel = lm(fitness_level ~ daily_steps, data = healthfitness)
# Pearson correlation between the same two variables
stepsfitnesscorr = cor(
  healthfitness$fitness_level,
  healthfitness$daily_steps,
  method = "pearson"
)
print(stepsfitnesscorr)
## [1] -0.4495721