Question 1: Load the Dataset
# Read the health & lifestyle CSV (path is relative to the working directory)
# into a data frame.
health_df <- read.csv(file = "health_lifestyle_dataset.csv")
Question 2: Print the Structure of Your Dataset
# Compact overview: dimensions plus the type and first values of each column.
str(object = health_df)
## 'data.frame': 100000 obs. of 16 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ age : int 56 69 46 32 60 25 78 38 56 75 ...
## $ gender : chr "Male" "Female" "Male" "Female" ...
## $ bmi : num 20.5 33.3 31.6 38.2 33.6 27.3 37.1 18.9 18.2 23.5 ...
## $ daily_steps : int 4198 14359 1817 15772 6037 19495 16739 1726 1764 9730 ...
## $ sleep_hours : num 3.9 9 6.6 3.6 3.8 5 9.5 4.8 5.1 4.5 ...
## $ water_intake_l : num 3.4 4.7 4.2 2 4 4.4 4.2 1.7 1.5 0.7 ...
## $ calories_consumed: int 1602 2346 1643 2460 3756 1301 3478 3212 3740 3571 ...
## $ smoker : int 0 0 0 0 0 0 1 0 0 0 ...
## $ alcohol : int 0 1 1 0 1 1 0 1 1 1 ...
## $ resting_hr : int 97 68 90 71 98 73 90 64 91 54 ...
## $ systolic_bp : int 161 116 123 165 139 107 110 113 112 177 ...
## $ diastolic_bp : int 111 65 99 95 61 65 102 109 61 90 ...
## $ cholesterol : int 240 207 296 175 294 284 201 197 237 157 ...
## $ family_history : int 0 0 0 0 0 0 0 0 1 1 ...
## $ disease_risk : int 0 0 0 0 0 0 0 0 0 0 ...
Question 3: List the Variables in Your Dataset
# List the column (variable) names; for a data frame names() and colnames()
# return the same character vector.
names(health_df)
## [1] "id" "age" "gender"
## [4] "bmi" "daily_steps" "sleep_hours"
## [7] "water_intake_l" "calories_consumed" "smoker"
## [10] "alcohol" "resting_hr" "systolic_bp"
## [13] "diastolic_bp" "cholesterol" "family_history"
## [16] "disease_risk"
Question 4: Print the Top 15 Rows of Your Dataset
# Show the first 15 rows of the dataset.
head(health_df, n = 15L)
Question 5: Write a User-Defined Function
#' Classify daily step counts into activity levels.
#'
#' Vectorised: accepts a single value or a whole column of step counts.
#'
#' @param steps Numeric vector of daily step counts.
#' @return Character vector the same length as `steps`:
#'   "Sedentary" for fewer than 5000 steps, "Moderately Active" for
#'   5000 up to (but not including) 10000, and "Active" for 10000 or more.
#'   `NA` inputs yield `NA`; zero-length input yields `character(0)`.
get_activity_level <- function(steps) {
  stopifnot(is.numeric(steps))
  level_labels <- c("Sedentary", "Moderately Active", "Active")
  # findInterval() returns 0, 1 or 2 depending on how many of the cut-offs
  # each value has reached (intervals are closed on the left), so adding 1
  # turns it into an index into level_labels.
  level_labels[findInterval(steps, c(5000, 10000)) + 1L]
}
# Try the classifier on the first participant's step count.
get_activity_level(health_df[["daily_steps"]][1])
## [1] "Sedentary"
Question 6: Filter Rows Based on Logical Criteria
# Keep only female participants who sleep fewer than six hours.
# Comma-separated conditions inside filter() are combined with AND.
# NOTE(review): relies on dplyr being attached; no library() call is visible
# in this excerpt - confirm it is loaded in an earlier chunk.
filtered_data <- filter(health_df, gender == "Female", sleep_hours < 6)
head(filtered_data)
Question 7: Reshape and Join a New Data Frame
# Split the dataset into two narrower tables that both carry the `id` key ...
demographics_df <- select(health_df, id, age, gender, bmi)
health_metrics_df <- select(
  health_df,
  id, daily_steps, sleep_hours, water_intake_l, disease_risk
)
# ... then stitch them back together, keeping only ids present in both.
joined_df <- demographics_df %>%
  inner_join(health_metrics_df, by = "id")
head(joined_df)
Question 8: Remove Missing Values
# Report how many cells in the whole data frame are NA.
print(
  paste("Total missing values:", sum(is.na(health_df)))
)
## [1] "Total missing values: 0"
# Drop every row that contains at least one NA.
health_df_no_missing <- na.omit(object = health_df)
Question 9: Identify and Remove Duplicated Data
# Report how many rows are exact repeats of an earlier row.
print(
  paste("Number of duplicated rows:", sum(duplicated(health_df)))
)
## [1] "Number of duplicated rows: 0"
# Keep one copy of each distinct row.
health_df_no_duplicates <- distinct(health_df)
Question 10: Reorder Multiple Rows
# Sort the rows from oldest to youngest participant.
reordered_df <- arrange(health_df, desc(age))
head(reordered_df)
Question 11: Rename Some of the Column Names
# Rename two columns (new_name = old_name); all other columns are unchanged.
renamed_df <- rename(
  health_df,
  StepsPerDay = daily_steps,
  SleepHours = sleep_hours
)
head(renamed_df)
Question 12: Add a New Variable
# Add a derived column: nightly sleep expressed in minutes instead of hours.
new_var_df <- mutate(health_df, sleep_minutes = sleep_hours * 60)
head(new_var_df)
Question 13: Create a Training Set
# Fix the RNG seed so the same rows are drawn on every run.
set.seed(123)
# Draw 75% of the row indices without replacement.
# sample.int() samples from 1..n directly, avoiding the `1:nrow()` idiom
# (which yields c(1, 0) for an empty data frame), and floor() makes the
# rounding of a fractional sample size explicit instead of implicit.
train_indices <- sample.int(
  nrow(health_df),
  size = floor(0.75 * nrow(health_df))
)
training_set <- health_df[train_indices, ]
print(paste("Training set size:", nrow(training_set)))
## [1] "Training set size: 75000"
Question 14: Print the Summary Statistics
# Per-column summary: quartiles and mean for numeric columns, length/class
# for character columns.
summary(object = health_df)
## id age gender bmi
## Min. : 1 Min. :18.00 Length:100000 Min. :18.00
## 1st Qu.: 25001 1st Qu.:33.00 Class :character 1st Qu.:23.50
## Median : 50000 Median :48.00 Mode :character Median :29.00
## Mean : 50000 Mean :48.53 Mean :29.02
## 3rd Qu.: 75000 3rd Qu.:64.00 3rd Qu.:34.50
## Max. :100000 Max. :79.00 Max. :40.00
## daily_steps sleep_hours water_intake_l calories_consumed
## Min. : 1000 Min. : 3.000 Min. :0.500 Min. :1200
## 1st Qu.: 5729 1st Qu.: 4.700 1st Qu.:1.600 1st Qu.:1906
## Median :10468 Median : 6.500 Median :2.800 Median :2603
## Mean :10480 Mean : 6.492 Mean :2.751 Mean :2603
## 3rd Qu.:15229 3rd Qu.: 8.200 3rd Qu.:3.900 3rd Qu.:3299
## Max. :19999 Max. :10.000 Max. :5.000 Max. :3999
## smoker alcohol resting_hr systolic_bp
## Min. :0.0000 Min. :0.0 Min. :50.00 Min. : 90.0
## 1st Qu.:0.0000 1st Qu.:0.0 1st Qu.:62.00 1st Qu.:112.0
## Median :0.0000 Median :0.0 Median :74.00 Median :135.0
## Mean :0.2009 Mean :0.3 Mean :74.46 Mean :134.6
## 3rd Qu.:0.0000 3rd Qu.:1.0 3rd Qu.:87.00 3rd Qu.:157.0
## Max. :1.0000 Max. :1.0 Max. :99.00 Max. :179.0
## diastolic_bp cholesterol family_history disease_risk
## Min. : 60.00 Min. :150.0 Min. :0.0000 Min. :0.0000
## 1st Qu.: 74.00 1st Qu.:187.0 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 89.00 Median :224.0 Median :0.0000 Median :0.0000
## Mean : 89.51 Mean :224.3 Mean :0.2992 Mean :0.2482
## 3rd Qu.:105.00 3rd Qu.:262.0 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :119.00 Max. :299.0 Max. :1.0000 Max. :1.0000
Question 16: Plot a Scatter Plot
# Scatter plot of sleep duration against daily step count, one point per
# participant, coloured by gender.
ggplot(
  health_df,
  aes(x = daily_steps, y = sleep_hours, color = gender)
) +
  geom_point() +
  labs(
    title = "Daily Steps vs. Sleep Hours",
    x = "Daily Steps",
    y = "Sleep Hours",
    color = "Gender"
  ) +
  theme_minimal()

Question 17: Plot a Bar Plot
# Bar plot of how many participants fall into each BMI category.
# `bmi` is a continuous measurement, so it is binned first; plotting it raw
# would draw one bar per distinct BMI value rather than per category.
# Bands use the conventional adult cut-offs 18.5 / 25 / 30 and are closed on
# the left, e.g. "Normal" is [18.5, 25).
# NOTE(review): adjust the breaks if the assignment defines other categories.
ggplot(
  data = health_df,
  aes(
    x = cut(
      bmi,
      breaks = c(-Inf, 18.5, 25, 30, Inf),
      labels = c("Underweight", "Normal", "Overweight", "Obese"),
      right = FALSE
    )
  )
) +
  geom_bar(fill = "steelblue") +
  labs(
    title = "Distribution of BMI Categories",
    x = "BMI Category",
    y = "Number of People"
  ) +
  theme_minimal()

Question 18: Find the Correlation Between Two Variables
# Pearson correlation between resting heart rate and systolic blood pressure.
correlation_value <- with(
  health_df,
  cor(resting_hr, systolic_bp, method = "pearson")
)
print(
  paste(
    "Pearson correlation between Resting HR and Systolic BP:",
    correlation_value
  )
)
## [1] "Pearson correlation between Resting HR and Systolic BP: 0.000791123027552913"