STEP 1: Import the Data File

We import the dataset and disable scientific notation for better readability.

# Read the CSV file (its first row holds the column names) into 'raw_data'
raw_data <- read.csv(file = "df.gr1.csv", header = TRUE)

# Print large numbers in full (e.g. 100000000) instead of scientific
# notation (e.g. 1e+08)
options(scipen = 999)

STEP 2: Initial Data Description

Initial exploration to detect missing values (NAs) and suspicious outliers.

# Per-column summary (min, quartiles, mean, max, NA counts) used to spot
# missing values and implausible ranges
summary(object = raw_data)
##        X              bmi              age           alcohol         
##  Min.   :    1   Min.   :  7.60   Min.   :  2.00   Length:31998      
##  1st Qu.: 8000   1st Qu.: 20.69   1st Qu.: 55.00   Class :character  
##  Median :16000   Median : 23.77   Median : 62.00   Mode  :character  
##  Mean   :16000   Mean   : 24.64   Mean   : 63.33                     
##  3rd Qu.:23999   3rd Qu.: 27.40   3rd Qu.: 70.00                     
##  Max.   :31998   Max.   :116.10   Max.   :166.00                     
##                                                                      
##      height       depression           angina              sex           
##  Min.   : 74.0   Length:31998       Length:31998       Length:31998      
##  1st Qu.:153.0   Class :character   Class :character   Class :character  
##  Median :160.0   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :159.4                                                           
##  3rd Qu.:167.0                                                           
##  Max.   :210.0                                                           
##  NA's   :12282                                                           
##      county          income              shoesize         cars          
##  Min.   : 1.00   Min.   :-187985831   Min.   :16.71   Length:31998      
##  1st Qu.: 6.00   1st Qu.:     17333   1st Qu.:34.29   Class :character  
##  Median :11.00   Median :     26646   Median :36.65   Mode  :character  
##  Mean   :11.07   Mean   :     46809   Mean   :36.56                     
##  3rd Qu.:16.00   3rd Qu.:     47424   3rd Qu.:38.98                     
##  Max.   :21.00   Max.   : 290151741   Max.   :51.61                     
##                                       NA's   :12282
# Compact overview of the data frame: dimensions, column types, and the first
# few values of each column
str(object = raw_data)
## 'data.frame':    31998 obs. of  12 variables:
##  $ X         : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ bmi       : num  25 23.5 23.8 24.5 25.4 ...
##  $ age       : int  66 55 70 66 52 55 53 60 67 82 ...
##  $ alcohol   : chr  "Abstainers" "Abstainers" "Abstainers" "Abstainers" ...
##  $ height    : int  155 171 165 163 172 158 140 NA 148 155 ...
##  $ depression: chr  "no" "no" "no" "no" ...
##  $ angina    : chr  "yes" "no" "yes" "yes" ...
##  $ sex       : chr  "Female" "Male" "Male" "Female" ...
##  $ county    : int  13 13 13 13 13 13 13 13 13 13 ...
##  $ income    : num  47861 37207 24717 25832 23573 ...
##  $ shoesize  : num  31.3 37.7 40.2 33.5 38.5 ...
##  $ cars      : chr  "" "" "" "" ...

STEP 2 (continued): Raw Data Visualization

Visual inspection of the raw age and income distributions.

# Two panels side by side: 1 row, 2 columns
par(mfrow = c(1, 2))

# Left panel: frequency distribution of age in the uncleaned data
hist(
  x = raw_data[["age"]],
  main = "Raw Age Distribution",
  xlab = "Age (Years)",
  col = "gray"
)

# Right panel: boxplot of income to expose negative values and extreme
# outliers
boxplot(
  x = raw_data[["income"]],
  main = "Raw Income Check",
  ylab = "Income",
  col = "lightgray"
)

# Return to the single-panel layout
par(mfrow = c(1, 1))

STEP 3: Data Cleaning

Filtering data to remove errors while preserving valid observations.

# Work on a copy so 'raw_data' stays untouched for later comparison
clean_data <- raw_data

# Why which(): indexing rows with a logical vector that contains NA makes `[`
# return an all-NA row instead of dropping it. which() keeps only the rows
# where the condition is TRUE, so a missing age, income or bmi can never
# introduce phantom rows. With no NAs in those columns the result is the same
# as plain logical indexing.

# Filter Age: keep only records where age is between 18 and 100 (inclusive)
clean_data <- clean_data[which(clean_data$age >= 18 & clean_data$age <= 100), ]

# Filter Income: remove rows with negative income, which is impossible
clean_data <- clean_data[which(clean_data$income >= 0), ]

# Filter BMI: remove extreme values (70 or more), likely data entry errors
clean_data <- clean_data[which(clean_data$bmi < 70), ]

# Filter Height: keep rows where height is missing (NA) OR within 120-220.
# is.na() is checked first, so this mask never contains NA.
clean_data <- clean_data[is.na(clean_data$height) |
                         (clean_data$height >= 120 & clean_data$height <= 220), ]

STEP 4: Calculate Missing “Weight” Variable

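Using the formula (height is recorded in centimetres, so it is converted to metres first):

$$\text{weight} = \text{BMI} \times \left(\frac{\text{height}}{100}\right)^{2}$$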

# Add a 'weight' column derived from BMI and height:
# weight = bmi * (height / 100)^2. Rows with a missing height get NA.
clean_data$weight <- with(clean_data, bmi * (height / 100)^2)

STEP 5: Describe Cleaned Data

Statistical summary and distribution of the newly calculated weight variable.

# Re-run the per-column summary to confirm the filters narrowed the ranges as
# intended and to inspect the new 'weight' column
summary(object = clean_data)
##        X              bmi             age           alcohol         
##  Min.   :    1   Min.   : 7.60   Min.   : 50.00   Length:31779      
##  1st Qu.: 7994   1st Qu.:20.70   1st Qu.: 55.00   Class :character  
##  Median :15996   Median :23.78   Median : 62.00   Mode  :character  
##  Mean   :15998   Mean   :24.62   Mean   : 63.33                     
##  3rd Qu.:23996   3rd Qu.:27.39   3rd Qu.: 70.00                     
##  Max.   :31998   Max.   :69.86   Max.   :100.00                     
##                                                                     
##      height       depression           angina              sex           
##  Min.   :120     Length:31779       Length:31779       Length:31779      
##  1st Qu.:154     Class :character   Class :character   Class :character  
##  Median :160     Mode  :character   Mode  :character   Mode  :character  
##  Mean   :160                                                             
##  3rd Qu.:167                                                             
##  Max.   :210                                                             
##  NA's   :12264                                                           
##      county          income             shoesize         cars          
##  Min.   : 1.00   Min.   :        4   Min.   :22.59   Length:31779      
##  1st Qu.: 6.00   1st Qu.:    17337   1st Qu.:34.36   Class :character  
##  Median :11.00   Median :    26647   Median :36.69   Mode  :character  
##  Mean   :11.07   Mean   :    52783   Mean   :36.67                     
##  3rd Qu.:16.00   3rd Qu.:    47453   3rd Qu.:39.00                     
##  Max.   :21.00   Max.   :290151741   Max.   :51.61                     
##                                      NA's   :12264                     
##      weight      
##  Min.   : 18.41  
##  1st Qu.: 53.33  
##  Median : 62.44  
##  Mean   : 63.87  
##  3rd Qu.: 72.57  
##  Max.   :188.32  
##  NA's   :12264
# Histogram of the derived weight column
hist(
  x = clean_data[["weight"]],
  main = "Calculated Weight (Cleaned)",
  xlab = "Weight (kg)",
  col = "skyblue"
)

STEP 6: Analysis of Histograms

Comparison of Shoesize, Income, and Age distributions.

# Three panels in one row so the distributions can be compared at a glance
par(mfrow = c(1, 3))

# Panel 1: shoe size, to observe the shape of its distribution
hist(
  x = clean_data[["shoesize"]],
  main = "Shoesize Distribution",
  xlab = "Shoe Size",
  col = "lightgreen"
)

# Panel 2: income, to check for skewness
hist(
  x = clean_data[["income"]],
  main = "Income Distribution",
  xlab = "Income",
  col = "gold"
)

# Panel 3: age, to see the spread of participants
hist(
  x = clean_data[["age"]],
  main = "Age Distribution",
  xlab = "Age (Years)",
  col = "coral"
)

# Return to the single-panel layout
par(mfrow = c(1, 1))

Characterization:

STEP 7: Scatter Plots and Patterns

Analyzing relationships between height, weight, and shoe size.

# Display two scatter plots side by side (1 row, 2 columns)
par(mfrow = c(1, 2))

# 7a. Height vs weight: solid points (pch = 19) drawn semi-transparent
# (alpha = 0.2) so dense regions of overlapping points stay readable
plot(clean_data$height, clean_data$weight, main = "Height vs Weight",
     xlab = "Height (cm)", ylab = "Weight (kg)", pch = 19,
     col = rgb(0, 0, 1, 0.2))

# 7b. Height vs shoe size: same point style, in red
plot(clean_data$height, clean_data$shoesize, main = "Height vs Shoesize",
     xlab = "Height (cm)", ylab = "Shoe Size", pch = 19,
     col = rgb(1, 0, 0, 0.2))

# Reset the plot area to the default single-panel layout, as the other
# plotting steps do, so the 1x2 layout does not leak into later plots
par(mfrow = c(1, 1))

RESULT:

Results & Observations: