Assignment 1 - Data Analysis using R Programming

Question 1: Load the Dataset

# Attach the packages the rest of the analysis relies on: dplyr for `%>%`
# and the data-manipulation verbs, ggplot2 for the plots. Loading them here
# keeps the script runnable from a fresh R session.
library(dplyr)
library(ggplot2)

# Read the health and lifestyle data; the path is relative to the working
# directory.
health_df <- read.csv("health_lifestyle_dataset.csv")

Question 2: Print the Structure of Your Dataset

# Compact overview: one line per column with its type and leading values.
str(object = health_df)

## 'data.frame':    100000 obs. of  16 variables:
##  $ id               : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ age              : int  56 69 46 32 60 25 78 38 56 75 ...
##  $ gender           : chr  "Male" "Female" "Male" "Female" ...
##  $ bmi              : num  20.5 33.3 31.6 38.2 33.6 27.3 37.1 18.9 18.2 23.5 ...
##  $ daily_steps      : int  4198 14359 1817 15772 6037 19495 16739 1726 1764 9730 ...
##  $ sleep_hours      : num  3.9 9 6.6 3.6 3.8 5 9.5 4.8 5.1 4.5 ...
##  $ water_intake_l   : num  3.4 4.7 4.2 2 4 4.4 4.2 1.7 1.5 0.7 ...
##  $ calories_consumed: int  1602 2346 1643 2460 3756 1301 3478 3212 3740 3571 ...
##  $ smoker           : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ alcohol          : int  0 1 1 0 1 1 0 1 1 1 ...
##  $ resting_hr       : int  97 68 90 71 98 73 90 64 91 54 ...
##  $ systolic_bp      : int  161 116 123 165 139 107 110 113 112 177 ...
##  $ diastolic_bp     : int  111 65 99 95 61 65 102 109 61 90 ...
##  $ cholesterol      : int  240 207 296 175 294 284 201 197 237 157 ...
##  $ family_history   : int  0 0 0 0 0 0 0 0 1 1 ...
##  $ disease_risk     : int  0 0 0 0 0 0 0 0 0 0 ...

Question 3: List the Variables in Your Dataset

# Variable (column) names of the data frame, in their stored order.
names(health_df)

##  [1] "id"                "age"               "gender"           
##  [4] "bmi"               "daily_steps"       "sleep_hours"      
##  [7] "water_intake_l"    "calories_consumed" "smoker"           
## [10] "alcohol"           "resting_hr"        "systolic_bp"      
## [13] "diastolic_bp"      "cholesterol"       "family_history"   
## [16] "disease_risk"

Question 4: Print the Top 15 Rows of Your Dataset

# Show the first 15 observations.
head(health_df, n = 15)

Question 5: Write a User-Defined Function

#' Classify daily step counts into activity levels
#'
#' Vectorized: accepts a single step count or a whole column of them.
#'
#' @param steps A numeric vector of daily step counts.
#' @return A character vector the same length as `steps`:
#'   "Sedentary" for fewer than 5000 steps, "Moderately Active" for
#'   5000 up to (but not including) 10000, and "Active" for 10000 or more.
#'   `NA` inputs yield `NA`.
get_activity_level <- function(steps) {
  stopifnot(is.numeric(steps))
  levels <- c("Sedentary", "Moderately Active", "Active")
  # findInterval() counts how many thresholds are <= each value (0, 1 or 2),
  # so adding 1 turns that count into an index into `levels`.
  levels[findInterval(steps, c(5000, 10000)) + 1L]
}
# Example: classify the first participant's daily step count.
get_activity_level(steps = health_df$daily_steps[1])

## [1] "Sedentary"

Question 6: Filter Rows Based on Logical Criteria

# Keep female participants who sleep fewer than six hours. Conditions given
# to filter() as separate arguments are combined with a logical AND.
filtered_data <- filter(health_df, gender == "Female", sleep_hours < 6)
head(filtered_data)

Question 7: Reshape and Join a New Data Frame

# Split the data into two narrower tables that share the `id` key.
demographics_df <- select(health_df, id, age, gender, bmi)
health_metrics_df <- select(
  health_df,
  id, daily_steps, sleep_hours, water_intake_l, disease_risk
)

# Recombine them, keeping only ids present in both tables.
joined_df <- demographics_df %>%
  inner_join(health_metrics_df, by = "id")
head(joined_df)

Question 8: Remove Missing Values

# Count every NA cell across all columns of the data frame.
print(paste("Total missing values:", sum(is.na(health_df)), sep = " "))

## [1] "Total missing values: 0"

# Drop any row that contains at least one NA.
health_df_no_missing <- health_df %>%
  na.omit()

Question 9: Identify and Remove Duplicated Data

# duplicated() flags each row that repeats an earlier row in full.
print(paste("Number of duplicated rows:", sum(duplicated(health_df)), sep = " "))

## [1] "Number of duplicated rows: 0"

# Keep one copy of each unique row.
health_df_no_duplicates <- distinct(health_df)

Question 10: Reorder Multiple Rows

# Sort rows from oldest to youngest participant.
reordered_df <- arrange(health_df, desc(age))
head(reordered_df)

Question 11: Rename Some of the Column Names

# rename() takes new_name = old_name pairs; all other columns are untouched.
renamed_df <- rename(
  health_df,
  StepsPerDay = daily_steps,
  SleepHours = sleep_hours
)
head(renamed_df)

Question 12: Add a New Variable

# Derive sleep duration in minutes from the hours column.
new_var_df <- mutate(health_df, sleep_minutes = 60 * sleep_hours)
head(new_var_df)

Question 13: Create a Training Set

# Fix the RNG seed so the same rows are drawn on every run.
set.seed(123)

# Draw 75% of the row indices without replacement. seq_len() is safe when the
# data frame has zero rows (1:0 would yield c(1, 0)), and floor() makes the
# rounding of a fractional sample size explicit.
train_indices <- sample(
  seq_len(nrow(health_df)),
  size = floor(0.75 * nrow(health_df))
)
training_set <- health_df[train_indices, ]
print(paste("Training set size:", nrow(training_set)))

## [1] "Training set size: 75000"

Question 14: Print the Summary Statistics

# Per-column summary: quartiles and mean for numeric columns, length and
# class for character columns.
summary(object = health_df)

##        id              age           gender               bmi       
##  Min.   :     1   Min.   :18.00   Length:100000      Min.   :18.00  
##  1st Qu.: 25001   1st Qu.:33.00   Class :character   1st Qu.:23.50  
##  Median : 50000   Median :48.00   Mode  :character   Median :29.00  
##  Mean   : 50000   Mean   :48.53                      Mean   :29.02  
##  3rd Qu.: 75000   3rd Qu.:64.00                      3rd Qu.:34.50  
##  Max.   :100000   Max.   :79.00                      Max.   :40.00  
##   daily_steps     sleep_hours     water_intake_l  calories_consumed
##  Min.   : 1000   Min.   : 3.000   Min.   :0.500   Min.   :1200     
##  1st Qu.: 5729   1st Qu.: 4.700   1st Qu.:1.600   1st Qu.:1906     
##  Median :10468   Median : 6.500   Median :2.800   Median :2603     
##  Mean   :10480   Mean   : 6.492   Mean   :2.751   Mean   :2603     
##  3rd Qu.:15229   3rd Qu.: 8.200   3rd Qu.:3.900   3rd Qu.:3299     
##  Max.   :19999   Max.   :10.000   Max.   :5.000   Max.   :3999     
##      smoker          alcohol      resting_hr     systolic_bp   
##  Min.   :0.0000   Min.   :0.0   Min.   :50.00   Min.   : 90.0  
##  1st Qu.:0.0000   1st Qu.:0.0   1st Qu.:62.00   1st Qu.:112.0  
##  Median :0.0000   Median :0.0   Median :74.00   Median :135.0  
##  Mean   :0.2009   Mean   :0.3   Mean   :74.46   Mean   :134.6  
##  3rd Qu.:0.0000   3rd Qu.:1.0   3rd Qu.:87.00   3rd Qu.:157.0  
##  Max.   :1.0000   Max.   :1.0   Max.   :99.00   Max.   :179.0  
##   diastolic_bp     cholesterol    family_history    disease_risk   
##  Min.   : 60.00   Min.   :150.0   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: 74.00   1st Qu.:187.0   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median : 89.00   Median :224.0   Median :0.0000   Median :0.0000  
##  Mean   : 89.51   Mean   :224.3   Mean   :0.2992   Mean   :0.2482  
##  3rd Qu.:105.00   3rd Qu.:262.0   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :119.00   Max.   :299.0   Max.   :1.0000   Max.   :1.0000

Question 15: Perform Statistical Functions

# Arithmetic mean of nightly sleep, reported to two decimal places.
mean_sleep <- mean(health_df[["sleep_hours"]])
print(paste("Mean Sleep Hours:", round(mean_sleep, digits = 2)))

## [1] "Mean Sleep Hours: 6.49"

# Middle value of nightly sleep.
median_sleep <- median(health_df[["sleep_hours"]])
print(paste("Median Sleep Hours:", median_sleep))

## [1] "Median Sleep Hours: 6.5"

#' Most frequent value of a vector
#'
#' @param v A vector.
#' @return The value of `v` that occurs most often. When several values tie,
#'   the one that appears first in `v` is returned.
get_mode <- function(v) {
  distinct_values <- unique(v)
  # Position of each element within the distinct values, then how many times
  # each position occurs.
  occurrences <- tabulate(match(v, distinct_values))
  distinct_values[which.max(occurrences)]
}
# Most common nightly sleep value.
mode_sleep <- get_mode(health_df[["sleep_hours"]])
print(paste("Mode of Sleep Hours:", mode_sleep))

## [1] "Mode of Sleep Hours: 8.5"

# Spread between the longest and shortest sleep; range() returns
# c(min, max), so diff() gives max - min.
range_sleep <- diff(range(health_df[["sleep_hours"]]))
print(paste("Range of Sleep Hours:", range_sleep))

## [1] "Range of Sleep Hours: 7"

Question 16: Plot a Scatter Plot

# Scatter plot of sleep against daily steps, one point per participant,
# coloured by gender.
ggplot(health_df, aes(x = daily_steps, y = sleep_hours, color = gender)) +
  geom_point() +
  theme_minimal() +
  labs(
    title = "Daily Steps vs. Sleep Hours",
    x = "Daily Steps",
    y = "Sleep Hours",
    color = "Gender"
  )

Question 17: Plot a Bar Plot

# BMI is stored as a continuous number, so it is binned into weight-status
# categories before counting; plotting raw values would draw one bar per
# distinct measurement instead of one per category.
# Bins are left-closed (right = FALSE): < 18.5 Underweight, 18.5 to < 25
# Normal, 25 to < 30 Overweight, >= 30 Obese.
ggplot(
  data = health_df,
  aes(x = cut(
    bmi,
    breaks = c(-Inf, 18.5, 25, 30, Inf),
    labels = c("Underweight", "Normal", "Overweight", "Obese"),
    right = FALSE
  ))
) +
  geom_bar(fill = "steelblue") +
  labs(title = "Distribution of BMI Categories",
       x = "BMI Category",
       y = "Number of People") +
  theme_minimal()

Question 18: Find the Correlation Between Two Variables

# Pearson (linear) correlation between resting heart rate and systolic
# blood pressure.
correlation_value <- cor(
  x = health_df[["resting_hr"]],
  y = health_df[["systolic_bp"]],
  method = "pearson"
)
print(paste("Pearson correlation between Resting HR and Systolic BP:", correlation_value))

## [1] "Pearson correlation between Resting HR and Systolic BP: 0.000791123027552913"

Assignment 1 - Data Analysis using R Programming

Group 1

2025-10-17

Question 1: Load the Dataset

Question 2: Print the Structure of Your Dataset

Question 3: List the Variables in Your Dataset

Question 4: Print the Top 15 Rows of Your Dataset

Question 5: Write a User-Defined Function

Question 6: Filter Rows Based on Logical Criteria

Question 7: Reshape and Join a New Data Frame

Question 8: Remove Missing Values

Question 9: Identify and Remove Duplicated Data

Question 10: Reorder Multiple Rows

Question 11: Rename Some of the Column Names

Question 12: Add a New Variable

Question 13: Create a Training Set

Question 14: Print the Summary Statistics

Question 15: Perform Statistical Functions

Question 16: Plot a Scatter Plot

Question 17: Plot a Bar Plot

Question 18: Find the Correlation Between Two Variables