Morgan State University

Department of Information Science & Systems

Fall 2024

INSS 615: Data Wrangling for Visualization

Name: Frenandez Lawrence, Mohammed Naveed Afroz Mulla, Onyebuchi Obiefuna

Due: November 24, 2024 (Sunday)

Project Implementation Phases: Phases 2, 3, 4, and 5 (as outlined on pages 4 & 5 of the syllabus) of the project should be completed using R Notebook. After completing phase 5, a knitted version of the document (containing descriptions/explanations of steps, code, and output of phases 2 to 5) should be uploaded by the specified due date. In addition to the final knitted version of the R Notebook, each group is expected to upload a written project report of 2 to 3 pages by the same due date. The report should summarize the project, including challenges faced, lessons learned, and the contributions of each group member.

Phase 2: Data Acquisition and Exploration Loading Dataset and Viewing Structure of the data

# Loading necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the raw ("dirty") obesity dataset from its CSV file
raw_csv_path <- "/Users/frenandezlawrence/Downloads/Dirty_Obesity_Dataset_.csv"
Obesitydata <- read.csv(file = raw_csv_path)

# Show each column's type together with its first few values
str(object = Obesitydata)
## 'data.frame':    1630 obs. of  15 variables:
##  $ Sex                              : chr  " Female " " Female " " Female " " Female " ...
##  $ Age                              : num  18 18 18 18 18 18 19 19 19 19 ...
##  $ Height                           : num  155 158 159 162 165 176 152 158 159 162 ...
##  $ Overweight_Obese_Family          : int  2 2 2 2 2 1 2 2 2 2 ...
##  $ Consumption_of_Fast_Food         : int  2 2 2 2 1 1 2 2 2 2 ...
##  $ Frequency_of_Consuming_Vegetables: int  3 3 2 2 2 1 3 3 2 2 ...
##  $ Number_of_Main_Meals_Daily       : int  1 1 1 2 1 1 1 2 2 2 ...
##  $ Food_Intake_Between_Meals        : int  3 1 3 2 3 4 1 4 2 2 ...
##  $ Smoking                          : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ Liquid_Intake_Daily              : num  1 1 3 2 1 2 3 1 1 2 ...
##  $ Calculation_of_Calorie_Intake    : int  2 2 2 2 2 2 2 1 2 1 ...
##  $ Physical_Excercise               : int  3 1 2 1 3 4 2 1 2 2 ...
##  $ Schedule_Dedicated_to_Technology : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Type_of_Transportation_Used      : int  4 3 4 4 2 4 2 3 4 4 ...
##  $ Class                            : int  2 2 2 2 2 2 2 2 2 3 ...
# Per-column summary statistics (NA counts are listed where a column has any)
summary(Obesitydata)
##      Sex                 Age             Height      Overweight_Obese_Family
##  Length:1630        Min.   : -5.10   Min.   :-10.3   Min.   :1.000          
##  Class :character   1st Qu.: 25.00   1st Qu.:161.0   1st Qu.:2.000          
##  Mode  :character   Median : 32.00   Median :168.0   Median :2.000          
##                     Mean   : 33.18   Mean   :168.2   Mean   :1.833          
##                     3rd Qu.: 41.00   3rd Qu.:174.0   3rd Qu.:2.000          
##                     Max.   :200.20   Max.   :800.7   Max.   :2.000          
##                                                                             
##  Consumption_of_Fast_Food Frequency_of_Consuming_Vegetables
##  Min.   :1.000            Min.   :1.000                    
##  1st Qu.:1.000            1st Qu.:2.000                    
##  Median :2.000            Median :2.000                    
##  Mean   :1.728            Mean   :2.064                    
##  3rd Qu.:2.000            3rd Qu.:3.000                    
##  Max.   :2.000            Max.   :3.000                    
##  NA's   :10                                                
##  Number_of_Main_Meals_Daily Food_Intake_Between_Meals    Smoking     
##  Min.   :1.000              Min.   :1.000             Min.   :1.000  
##  1st Qu.:1.000              1st Qu.:2.000             1st Qu.:1.000  
##  Median :2.000              Median :2.000             Median :2.000  
##  Mean   :1.872              Mean   :2.394             Mean   :1.695  
##  3rd Qu.:2.000              3rd Qu.:3.000             3rd Qu.:2.000  
##  Max.   :3.000              Max.   :4.000             Max.   :2.000  
##  NA's   :10                                                          
##  Liquid_Intake_Daily Calculation_of_Calorie_Intake Physical_Excercise
##  Min.   :-10.700     Min.   :1.000                 Min.   :1.000     
##  1st Qu.:  1.000     1st Qu.:2.000                 1st Qu.:2.000     
##  Median :  2.000     Median :2.000                 Median :3.000     
##  Mean   :  2.189     Mean   :1.822                 Mean   :3.262     
##  3rd Qu.:  3.000     3rd Qu.:2.000                 3rd Qu.:4.000     
##  Max.   :100.300     Max.   :2.000                 Max.   :5.000     
##                                                    NA's   :5         
##  Schedule_Dedicated_to_Technology Type_of_Transportation_Used     Class      
##  Min.   :1.000                    Min.   :1.000               Min.   :1.000  
##  1st Qu.:2.000                    1st Qu.:1.000               1st Qu.:2.000  
##  Median :2.000                    Median :3.000               Median :3.000  
##  Mean   :2.015                    Mean   :2.666               Mean   :2.677  
##  3rd Qu.:2.750                    3rd Qu.:4.000               3rd Qu.:3.000  
##  Max.   :3.000                    Max.   :5.000               Max.   :4.000  
## 

Phase 2: Data Acquisition and Exploration Checking Missing Values

# Number of NA entries in every column (named by column)
missing_values <- vapply(
  Obesitydata,
  function(column) sum(is.na(column)),
  numeric(1)
)

# The same counts expressed as a percentage of all rows
missing_percentage <- 100 * (missing_values / nrow(Obesitydata))

# One row per column: its name, NA count and NA percentage
missing_summary <- data.frame(
  Column = names(Obesitydata),
  Missing_Values = missing_values,
  Percentage_Missing = missing_percentage
)

# Show the table
print(missing_summary)
##                                                              Column
## Sex                                                             Sex
## Age                                                             Age
## Height                                                       Height
## Overweight_Obese_Family                     Overweight_Obese_Family
## Consumption_of_Fast_Food                   Consumption_of_Fast_Food
## Frequency_of_Consuming_Vegetables Frequency_of_Consuming_Vegetables
## Number_of_Main_Meals_Daily               Number_of_Main_Meals_Daily
## Food_Intake_Between_Meals                 Food_Intake_Between_Meals
## Smoking                                                     Smoking
## Liquid_Intake_Daily                             Liquid_Intake_Daily
## Calculation_of_Calorie_Intake         Calculation_of_Calorie_Intake
## Physical_Excercise                               Physical_Excercise
## Schedule_Dedicated_to_Technology   Schedule_Dedicated_to_Technology
## Type_of_Transportation_Used             Type_of_Transportation_Used
## Class                                                         Class
##                                   Missing_Values Percentage_Missing
## Sex                                            0          0.0000000
## Age                                            0          0.0000000
## Height                                         0          0.0000000
## Overweight_Obese_Family                        0          0.0000000
## Consumption_of_Fast_Food                      10          0.6134969
## Frequency_of_Consuming_Vegetables              0          0.0000000
## Number_of_Main_Meals_Daily                    10          0.6134969
## Food_Intake_Between_Meals                      0          0.0000000
## Smoking                                        0          0.0000000
## Liquid_Intake_Daily                            0          0.0000000
## Calculation_of_Calorie_Intake                  0          0.0000000
## Physical_Excercise                             5          0.3067485
## Schedule_Dedicated_to_Technology               0          0.0000000
## Type_of_Transportation_Used                    0          0.0000000
## Class                                          0          0.0000000

Phase 2: Data Acquisition and Exploration Visualizing Missing Data

# Visualize missing data using VIM package
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
# Missing-data overview from VIM::aggr(); variables are sorted by their
# number of missing values (sortVars = TRUE)
aggr_plot <- aggr(
  Obesitydata,
  col = c("navyblue", "red"),
  numbers = TRUE,
  sortVars = TRUE,
  labels = names(Obesitydata),
  cex.axis = 0.7,
  gap = 3,
  ylab = c("Histogram of Obesity Missing Data", "Pattern")
)

## 
##  Variables sorted by number of missings: 
##                           Variable       Count
##           Consumption_of_Fast_Food 0.006134969
##         Number_of_Main_Meals_Daily 0.006134969
##                 Physical_Excercise 0.003067485
##                                Sex 0.000000000
##                                Age 0.000000000
##                             Height 0.000000000
##            Overweight_Obese_Family 0.000000000
##  Frequency_of_Consuming_Vegetables 0.000000000
##          Food_Intake_Between_Meals 0.000000000
##                            Smoking 0.000000000
##                Liquid_Intake_Daily 0.000000000
##      Calculation_of_Calorie_Intake 0.000000000
##   Schedule_Dedicated_to_Technology 0.000000000
##        Type_of_Transportation_Used 0.000000000
##                              Class 0.000000000

Phase 2: Data Acquisition and Exploration Summary Statistics for Numeric Fields

# Select numeric columns
# Keep only the numeric columns. select(where(...)) replaces the superseded
# scoped verb select_if() and returns the same columns.
numeric_data <- Obesitydata %>%
  select(where(is.numeric))

# Summary statistics for the numeric columns
summary(numeric_data)
##       Age             Height      Overweight_Obese_Family
##  Min.   : -5.10   Min.   :-10.3   Min.   :1.000          
##  1st Qu.: 25.00   1st Qu.:161.0   1st Qu.:2.000          
##  Median : 32.00   Median :168.0   Median :2.000          
##  Mean   : 33.18   Mean   :168.2   Mean   :1.833          
##  3rd Qu.: 41.00   3rd Qu.:174.0   3rd Qu.:2.000          
##  Max.   :200.20   Max.   :800.7   Max.   :2.000          
##                                                          
##  Consumption_of_Fast_Food Frequency_of_Consuming_Vegetables
##  Min.   :1.000            Min.   :1.000                    
##  1st Qu.:1.000            1st Qu.:2.000                    
##  Median :2.000            Median :2.000                    
##  Mean   :1.728            Mean   :2.064                    
##  3rd Qu.:2.000            3rd Qu.:3.000                    
##  Max.   :2.000            Max.   :3.000                    
##  NA's   :10                                                
##  Number_of_Main_Meals_Daily Food_Intake_Between_Meals    Smoking     
##  Min.   :1.000              Min.   :1.000             Min.   :1.000  
##  1st Qu.:1.000              1st Qu.:2.000             1st Qu.:1.000  
##  Median :2.000              Median :2.000             Median :2.000  
##  Mean   :1.872              Mean   :2.394             Mean   :1.695  
##  3rd Qu.:2.000              3rd Qu.:3.000             3rd Qu.:2.000  
##  Max.   :3.000              Max.   :4.000             Max.   :2.000  
##  NA's   :10                                                          
##  Liquid_Intake_Daily Calculation_of_Calorie_Intake Physical_Excercise
##  Min.   :-10.700     Min.   :1.000                 Min.   :1.000     
##  1st Qu.:  1.000     1st Qu.:2.000                 1st Qu.:2.000     
##  Median :  2.000     Median :2.000                 Median :3.000     
##  Mean   :  2.189     Mean   :1.822                 Mean   :3.262     
##  3rd Qu.:  3.000     3rd Qu.:2.000                 3rd Qu.:4.000     
##  Max.   :100.300     Max.   :2.000                 Max.   :5.000     
##                                                    NA's   :5         
##  Schedule_Dedicated_to_Technology Type_of_Transportation_Used     Class      
##  Min.   :1.000                    Min.   :1.000               Min.   :1.000  
##  1st Qu.:2.000                    1st Qu.:1.000               1st Qu.:2.000  
##  Median :2.000                    Median :3.000               Median :3.000  
##  Mean   :2.015                    Mean   :2.666               Mean   :2.677  
##  3rd Qu.:2.750                    3rd Qu.:4.000               3rd Qu.:3.000  
##  Max.   :3.000                    Max.   :5.000               Max.   :4.000  
## 

Phase 2: Data Acquisition and Exploration Exploratory Visualizations: Histogram of Age

# Histogram of Age on the raw data, with bins 5 units wide
Obesitydata %>%
  ggplot(mapping = aes(x = Age)) +
  geom_histogram(binwidth = 5, fill = "blue", colour = "white") +
  labs(title = "Distribution of Age", x = "Age", y = "Frequency")

Exploratory Visualizations: Boxplot for Height

# Boxplot of Height on the raw data
ggplot(data = Obesitydata, mapping = aes(y = Height)) +
  geom_boxplot(fill = "orange") +
  ggtitle("Boxplot of Height") +
  ylab("Height (cm)")

Exploratory Visualizations: Scatterplot of Age vs Height

# Age against Height on the raw data; points are semi-transparent so that
# overlapping observations remain visible
ggplot(data = Obesitydata) +
  geom_point(mapping = aes(x = Age, y = Height), alpha = 0.6) +
  labs(title = "Age vs. Height", x = "Age (In Years)", y = "Height (cm)")

Phase 3: Data Cleaning and Transformation Handling Missing Values

# Rule of thumb used in this course: when less than 5% of the data is missing,
# the affected rows can simply be dropped. The missing-value summary shows at
# most about 0.6% missing per column, so the rows are omitted, not imputed.
Obesitydata_cleaned <- na.omit(Obesitydata)

# str() shows Sex values padded with spaces (" Female "). Strip the padding so
# the categories compare and plot under clean labels.
# NOTE(review): other inconsistencies in Sex (e.g. mixed case) are not visible
# in the output shown; check unique(Obesitydata_cleaned$Sex) to confirm.
Obesitydata_cleaned$Sex <- trimws(Obesitydata_cleaned$Sex)

Phase 3: Data Cleaning and Transformation Outlier Detection

# Quartiles and interquartile range of Height
# (Q1, Q3 and IQR are reused by the outlier-removal step)
height_quartiles <- quantile(
  Obesitydata_cleaned$Height,
  probs = c(0.25, 0.75),
  na.rm = TRUE
)
Q1 <- height_quartiles[1]
Q3 <- height_quartiles[2]
IQR <- Q3 - Q1

# Heights lying more than 1.5 * IQR below Q1 or above Q3
height_lower_fence <- Q1 - 1.5 * IQR
height_upper_fence <- Q3 + 1.5 * IQR
is_height_outlier <- Obesitydata_cleaned$Height < height_lower_fence |
  Obesitydata_cleaned$Height > height_upper_fence
outliers <- Obesitydata_cleaned$Height[is_height_outlier]
print(outliers)
## [1] 800.7 500.5 -10.3
# Quartiles and interquartile range of Age
# (AgeQ1, AgeQ3 and AgeIQR are reused by the outlier-removal step)
age_quartiles <- quantile(
  Obesitydata_cleaned$Age,
  probs = c(0.25, 0.75),
  na.rm = TRUE
)
AgeQ1 <- age_quartiles[1]
AgeQ3 <- age_quartiles[2]
AgeIQR <- AgeQ3 - AgeQ1

# Ages lying more than 1.5 * IQR below Q1 or above Q3
age_lower_fence <- AgeQ1 - 1.5 * AgeIQR
age_upper_fence <- AgeQ3 + 1.5 * AgeIQR
is_age_outlier <- Obesitydata_cleaned$Age < age_lower_fence |
  Obesitydata_cleaned$Age > age_upper_fence
Ageoutliers <- Obesitydata_cleaned$Age[is_age_outlier]

# Show the flagged ages
print(Ageoutliers)
## [1]  -5.1 200.2
# Quartiles and interquartile range of Liquid_Intake_Daily
# (LiquidQ1, LiquidQ3 and LiquidIQR are reused by the outlier-removal step)
liquid_quartiles <- quantile(
  Obesitydata_cleaned$Liquid_Intake_Daily,
  probs = c(0.25, 0.75),
  na.rm = TRUE
)
LiquidQ1 <- liquid_quartiles[1]
LiquidQ3 <- liquid_quartiles[2]
LiquidIQR <- LiquidQ3 - LiquidQ1

# Intake values lying more than 1.5 * IQR below Q1 or above Q3
liquid_lower_fence <- LiquidQ1 - 1.5 * LiquidIQR
liquid_upper_fence <- LiquidQ3 + 1.5 * LiquidIQR
is_liquid_outlier <-
  Obesitydata_cleaned$Liquid_Intake_Daily < liquid_lower_fence |
  Obesitydata_cleaned$Liquid_Intake_Daily > liquid_upper_fence
Liquidoutliers <- Obesitydata_cleaned$Liquid_Intake_Daily[is_liquid_outlier]

# Show the flagged intake values
print(Liquidoutliers)
## [1]  50.5 -10.7 100.3

Phase 3: Data Cleaning and Transformation Outlier Removal

#Obesitydata_no_outliers$Age <- as.numeric(Obesitydata_no_outliers$Age)

# Keep only rows whose Height, Age and Liquid_Intake_Daily all lie within
# 1.5 * IQR of their quartiles (quartiles and IQRs come from the outlier
# detection step). Columns are referenced by bare name: filter() evaluates its
# conditions inside the piped-in data frame, so repeating
# `Obesitydata_cleaned$` is unnecessary and would silently read the wrong rows
# if the pipeline input ever changed. Comma-separated conditions are combined
# with AND.
Obesitydata_no_outliers <- Obesitydata_cleaned %>%
  filter(
    Height >= (Q1 - 1.5 * IQR),
    Height <= (Q3 + 1.5 * IQR),
    Age >= (AgeQ1 - 1.5 * AgeIQR),
    Age <= (AgeQ3 + 1.5 * AgeIQR),
    Liquid_Intake_Daily >= (LiquidQ1 - 1.5 * LiquidIQR),
    Liquid_Intake_Daily <= (LiquidQ3 + 1.5 * LiquidIQR)
  )

Phase 3: Data Cleaning and Transformation Data Conversion of Liquid Intake and Age From Dbl to Int

view(Obesitydata_no_outliers)

# Age and Liquid_Intake_Daily are read in as doubles (see str() output).
# as.numeric() on a double changes nothing, so the intended Dbl -> Int
# conversion needs as.integer(). as.integer() truncates towards zero, so the
# helper refuses to convert a column that still holds fractional values
# instead of silently altering the data.
to_whole_number <- function(values, column_name) {
  if (any(values != round(values), na.rm = TRUE)) {
    stop(
      "Column '", column_name,
      "' contains fractional values; not converting to integer.",
      call. = FALSE
    )
  }
  as.integer(values)
}

Obesitydata_no_outliers$Age <-
  to_whole_number(Obesitydata_no_outliers$Age, "Age")
Obesitydata_no_outliers$Liquid_Intake_Daily <-
  to_whole_number(
    Obesitydata_no_outliers$Liquid_Intake_Daily,
    "Liquid_Intake_Daily"
  )

Advanced Visualizations: Correlation Heatmap

library(corrplot)
## corrplot 0.94 loaded
# Keep only the numeric columns. select(where(...)) replaces the superseded
# scoped verb select_if() and returns the same columns.
numeric_data <- Obesitydata_no_outliers %>%
  select(where(is.numeric))

# Pairwise correlations, computed on rows with no missing values
cor_matrix <- cor(numeric_data, use = "complete.obs")

# Coloured heatmap with the coefficient printed in every cell
corrplot(
  cor_matrix,
  method = "color",
  addCoef.col = "black",
  number.cex = 0.7
)

Advanced Visualizations: Bar Plot For Categorical Variables

# Number of observations per Sex value after outlier removal
Obesitydata_no_outliers %>%
  ggplot(mapping = aes(x = Sex)) +
  geom_bar(fill = "blue", alpha = 0.7) +
  labs(title = "Distribution of Sex", x = "Sex", y = "Count")

Advanced Visualizations: Pair Plot (Multivariate Analysis)

library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Pairwise plot matrix for three of the numeric variables
pair_plot_columns <- c("Age", "Height", "Liquid_Intake_Daily")
ggpairs(data = Obesitydata_no_outliers, columns = pair_plot_columns)

Phase 4: Data Shaping and Structuring Categorizing Age

#rm(Structured_obesity_data)
#View(Obesitydata_no_outliers)
# Start the structured dataset from the outlier-free data and bin Age into
# four labelled groups. cut() uses right-closed intervals by default, so the
# bins are (0,17], (17,35], (35,60] and (60,100].
Structured_obesity_data <- Obesitydata_no_outliers %>%
  mutate(
    Age_Group = cut(
      Age,
      breaks = c(0, 17, 35, 60, 100),
      labels = c("Child", "Young Adult", "Adult", "Senior")
    )
  )
view(Structured_obesity_data)

# Summary now includes the per-group counts for Age_Group
summary(Structured_obesity_data)
##      Sex                 Age            Height      Overweight_Obese_Family
##  Length:1602        Min.   :18.00   Min.   :150.0   Min.   :1.00           
##  Class :character   1st Qu.:25.00   1st Qu.:161.0   1st Qu.:2.00           
##  Mode  :character   Median :32.00   Median :168.0   Median :2.00           
##                     Mean   :33.08   Mean   :167.7   Mean   :1.83           
##                     3rd Qu.:41.00   3rd Qu.:174.0   3rd Qu.:2.00           
##                     Max.   :54.00   Max.   :193.0   Max.   :2.00           
##  Consumption_of_Fast_Food Frequency_of_Consuming_Vegetables
##  Min.   :1.000            Min.   :1.000                    
##  1st Qu.:1.000            1st Qu.:1.000                    
##  Median :2.000            Median :2.000                    
##  Mean   :1.727            Mean   :2.064                    
##  3rd Qu.:2.000            3rd Qu.:3.000                    
##  Max.   :2.000            Max.   :3.000                    
##  Number_of_Main_Meals_Daily Food_Intake_Between_Meals    Smoking     
##  Min.   :1.000              Min.   :1.000             Min.   :1.000  
##  1st Qu.:1.000              1st Qu.:2.000             1st Qu.:1.000  
##  Median :2.000              Median :2.000             Median :2.000  
##  Mean   :1.875              Mean   :2.397             Mean   :1.693  
##  3rd Qu.:2.000              3rd Qu.:3.000             3rd Qu.:2.000  
##  Max.   :3.000              Max.   :4.000             Max.   :2.000  
##  Liquid_Intake_Daily Calculation_of_Calorie_Intake Physical_Excercise
##  Min.   :1.000       Min.   :1.000                 Min.   :1.000     
##  1st Qu.:1.000       1st Qu.:2.000                 1st Qu.:2.000     
##  Median :2.000       Median :2.000                 Median :3.000     
##  Mean   :2.108       Mean   :1.822                 Mean   :3.258     
##  3rd Qu.:3.000       3rd Qu.:2.000                 3rd Qu.:4.000     
##  Max.   :3.000       Max.   :2.000                 Max.   :5.000     
##  Schedule_Dedicated_to_Technology Type_of_Transportation_Used     Class      
##  Min.   :1.000                    Min.   :1.000               Min.   :1.000  
##  1st Qu.:2.000                    1st Qu.:1.000               1st Qu.:2.000  
##  Median :2.000                    Median :3.000               Median :3.000  
##  Mean   :2.016                    Mean   :2.665               Mean   :2.677  
##  3rd Qu.:3.000                    3rd Qu.:4.000               3rd Qu.:3.000  
##  Max.   :3.000                    Max.   :5.000               Max.   :4.000  
##        Age_Group  
##  Child      :  0  
##  Young Adult:969  
##  Adult      :633  
##  Senior     :  0  
##                   
## 

Categorizing Overweight Obese Family

# Add Overweight_Family: a labelled copy of Overweight_Obese_Family
# (1 = "Yes", 2 = "No"). mutate() creates the column itself, so no
# pre-assignment is needed, and the source column is referenced by bare name
# because mutate() evaluates expressions inside the data frame.
Structured_obesity_data <- Structured_obesity_data %>%
  mutate(
    Overweight_Family = recode(
      Overweight_Obese_Family,
      `1` = "Yes",
      `2` = "No"
    )
  )

Categorizing Consumption of Fast Food

# Add FastFood_Consumption: labelled copy of Consumption_of_Fast_Food
# (1 = "Yes", 2 = "No"). The source column is referenced by bare name because
# mutate() evaluates expressions inside the data frame.
Structured_obesity_data <- Structured_obesity_data %>%
  mutate(
    FastFood_Consumption = recode(
      Consumption_of_Fast_Food,
      `1` = "Yes",
      `2` = "No"
    )
  )

Categorizing Frequency of Vegetables

# Add Vegetable_Intake: labelled copy of Frequency_of_Consuming_Vegetables
# (1 = "Rarely", 2 = "Sometimes", 3 = "Always"). The source column is
# referenced by bare name because mutate() evaluates expressions inside the
# data frame.
Structured_obesity_data <- Structured_obesity_data %>%
  mutate(
    Vegetable_Intake = recode(
      Frequency_of_Consuming_Vegetables,
      `1` = "Rarely",
      `2` = "Sometimes",
      `3` = "Always"
    )
  )

Categorizing Food Intake Between Meals

# Add Food_Intake: labelled copy of Food_Intake_Between_Meals
# (1 = "Rarely", 2 = "Sometimes", 3 = "Usually", 4 = "Always"). The source
# column is referenced by bare name because mutate() evaluates expressions
# inside the data frame.
Structured_obesity_data <- Structured_obesity_data %>%
  mutate(
    Food_Intake = recode(
      Food_Intake_Between_Meals,
      `1` = "Rarely",
      `2` = "Sometimes",
      `3` = "Usually",
      `4` = "Always"
    )
  )

Categorizing Smokers

# Add Smoke: labelled copy of Smoking (1 = "Yes", 2 = "No"). The source column
# is referenced by bare name because mutate() evaluates expressions inside the
# data frame.
Structured_obesity_data <- Structured_obesity_data %>%
  mutate(
    Smoke = recode(
      Smoking,
      `1` = "Yes",
      `2` = "No"
    )
  )

Categorizing Liquid Intake

# Add Liquid_Intake: labelled copy of Liquid_Intake_Daily (codes 1 to 3). The
# source column is referenced by bare name because mutate() evaluates
# expressions inside the data frame.
Structured_obesity_data <- Structured_obesity_data %>%
  mutate(
    Liquid_Intake = recode(
      Liquid_Intake_Daily,
      `1` = "Less Than 1 Liter",
      `2` = "Between 1 and 2 Liters",
      `3` = "Greater Than 2 Liters"
    )
  )

Categorizing Calorie Intake

# Add Calculate_Calorie_Intake: labelled copy of Calculation_of_Calorie_Intake
# (1 = "Yes", 2 = "No"). The source column is referenced by bare name because
# mutate() evaluates expressions inside the data frame.
Structured_obesity_data <- Structured_obesity_data %>%
  mutate(
    Calculate_Calorie_Intake = recode(
      Calculation_of_Calorie_Intake,
      `1` = "Yes",
      `2` = "No"
    )
  )

Categorizing Physical Exercise

# Add Exercise: labelled copy of Physical_Excercise (codes 1 to 5). The source
# column is referenced by bare name because mutate() evaluates expressions
# inside the data frame.
# NOTE(review): the labels "5 to 6 days" and "6+ days" overlap at 6 days;
# confirm the wording against the dataset's codebook.
Structured_obesity_data <- Structured_obesity_data %>%
  mutate(
    Exercise = recode(
      Physical_Excercise,
      `1` = "No Physical Activity",
      `2` = "1 to 2 days",
      `3` = "3 to 4 days",
      `4` = "5 to 6 days",
      `5` = "6+ days"
    )
  )

Categorizing Schedule Dedicated To Technology

# Add Screen_Time: labelled copy of Schedule_Dedicated_to_Technology
# (codes 1 to 3). The source column is referenced by bare name because
# mutate() evaluates expressions inside the data frame.
# NOTE(review): the labels "3 to 5 Hours" and "5+ Hours" overlap at 5 hours;
# confirm the wording against the dataset's codebook.
Structured_obesity_data <- Structured_obesity_data %>%
  mutate(
    Screen_Time = recode(
      Schedule_Dedicated_to_Technology,
      `1` = "0 to 2 Hours",
      `2` = "3 to 5 Hours",
      `3` = "5+ Hours"
    )
  )

Categorizing Types Of Transportation

# Add Transportation_Mode: labelled copy of Type_of_Transportation_Used
# (codes 1 to 5). The source column is referenced by bare name because
# mutate() evaluates expressions inside the data frame.
Structured_obesity_data <- Structured_obesity_data %>%
  mutate(
    Transportation_Mode = recode(
      Type_of_Transportation_Used,
      `1` = "Automobile",
      `2` = "MotorBike",
      `3` = "Bike",
      `4` = "Public Transportation",
      `5` = "Walking"
    )
  )

Categorizing Class

# Add Obesity_Class: labelled copy of Class (codes 1 to 4). The source column
# is referenced by bare name because mutate() evaluates expressions inside the
# data frame.
Structured_obesity_data <- Structured_obesity_data %>%
  mutate(
    Obesity_Class = recode(
      Class,
      `1` = "Underweight",
      `2` = "Normal",
      `3` = "Overweight",
      `4` = "Obesity"
    )
  )

Viewing Dataset Converted With Categorical Variables

#View(Structured_obesity_data)

# Dataset used for the final visualizations: Age, Sex, Height and the labelled
# (categorical) columns, leaving out the original numeric code columns
visualizationData <- Structured_obesity_data[c(
  "Age",
  "Sex",
  "Height",
  "Age_Group",
  "Overweight_Family",
  "FastFood_Consumption",
  "Vegetable_Intake",
  "Food_Intake",
  "Smoke",
  "Liquid_Intake",
  "Calculate_Calorie_Intake",
  "Exercise",
  "Screen_Time",
  "Transportation_Mode",
  "Obesity_Class"
)]

#write.csv(visualizationData,"Group3visualizationdata.csv", row.names = FALSE)

Phase 5: Data Visualization After Cleaning and Transforming Data Histogram of Age

# With missing rows and outliers removed, the Age histogram no longer shows
# the extreme values seen in the raw data.
visualizationData %>%
  ggplot(mapping = aes(x = Age)) +
  geom_histogram(binwidth = 5, fill = "blue", colour = "white") +
  labs(title = "Distribution of Age", x = "Age", y = "Frequency")

Boxplot for Height

# After cleaning and transformation the boxplot is no longer distorted by
# extreme values, so the distribution of Height is easier to interpret.

ggplot(data = visualizationData, mapping = aes(y = Height)) +
  geom_boxplot(fill = "orange") +
  ggtitle("Boxplot of Height") +
  ylab("Height (cm)")

Scatterplot of Age vs Height

# Age against Height on the cleaned data; points are semi-transparent so that
# overlapping observations remain visible
ggplot(data = visualizationData) +
  geom_point(mapping = aes(x = Age, y = Height), alpha = 0.6) +
  labs(title = "Age vs. Height", x = "Age (Years)", y = "Height (cm)")

Histogram of Obesity Categories

# After cleaning and transforming the data we can see how the obesity
# categories are distributed. Most individuals in the dataset are either
# overweight or obese.
#
# Obesity_Class is a character column, so ggplot would order the bars
# alphabetically (Normal, Obesity, Overweight, Underweight). It is converted
# to a factor with explicit levels for this plot only, so the bars follow the
# ordinal scale; visualizationData itself is left unchanged.
obesity_class_levels <- c("Underweight", "Normal", "Overweight", "Obesity")

visualizationData %>%
  mutate(Obesity_Class = factor(Obesity_Class, levels = obesity_class_levels)) %>%
  ggplot(aes(x = Obesity_Class)) +
  geom_bar(fill = "blue", alpha = 0.7) +
  labs(title = "Distribution of Obesity Category", x = "Obesity Category", y = "Count")

Correlation of Height and Age Using a Heatmap

library(corrplot)

# Keep only the numeric columns of the visualization dataset.
# select(where(...)) replaces the superseded scoped verb select_if().
viz_numeric_data <- visualizationData %>%
  select(where(is.numeric))

# Pairwise correlations, computed on rows with no missing values
cor_matrix <- cor(viz_numeric_data, use = "complete.obs")

# Plot the lower triangle of the correlation matrix with corrplot
corrplot(
  cor_matrix,
  method = "color",
  col = colorRampPalette(c("red", "white", "green"))(200),
  type = "lower",
  addCoef.col = "black",  # Print the correlation coefficient in each cell
  tl.col = "black",       # Text label colour
  title = "Correlation Heatmap of Height and Age",
  mar = c(0, 0, 2, 0)     # Leave room at the top so the title is not clipped
)