Exploratory Data Analysis (EDA) - Looking into the raw data

Basic overview

# Load the raw spreadsheet into a data frame.
# NOTE(review): read.xlsx() is presumably openxlsx::read.xlsx (no sheet
# argument is given) — confirm which package is attached.
df <- read.xlsx("data.xlsx")
# Compact overview: storage type and first few values of every column.
str(df)
## 'data.frame':    452 obs. of  60 variables:
##  $ Country                         : chr  "USA" "USA" "USA" "USA" ...
##  $ Region                          : chr  "GA" "NE" "SC" "TX" ...
##  $ DataYear                        : num  2022 2017 2020 2023 2014 ...
##  $ ClassGrade                      : num  12 10 12 12 12 8 11 12 10 12 ...
##  $ Gender                          : chr  "Male" "Male" "Female" "Female" ...
##  $ Ageyears                        : num  17 16 18 17 17 11 16 17 15 16 ...
##  $ Handed                          : chr  "Right-Handed" "Right-Handed" "Right-Handed" "Right-Handed" ...
##  $ Height_cm                       : chr  "182" "70" "5'3" "172" ...
##  $ Footlength_cm                   : chr  "33" "32" NA "26" ...
##  $ Armspan_cm                      : chr  "193" "170" NA "177" ...
##  $ Languages_spoken                : num  1 2 1 1 1 1 2 1 1.5 2 ...
##  $ Travel_to_School                : chr  "Car" "Car" "Car" "Car" ...
##  $ Travel_time_to_School           : chr  "10" "30" NA "7" ...
##  $ Reaction_time                   : num  0.324 0.34 3.061 0.328 0.501 ...
##  $ Score_in_memory_game            : chr  "37" "50" "39" "35" ...
##  $ Favourite_physical_activity     : chr  "Lacrosse" "Baseball/Softball" "Athletics" "Other" ...
##  $ Importance_reducing_pollution   : num  NA 900 299 650 800 572 900 450 200 791 ...
##  $ Importance_recycling_rubbish    : num  NA 900 1000 600 900 583 500 450 100 503 ...
##  $ Importance_conserving_water     : chr  NA "900" "326" "450" ...
##  $ Importance_saving_energy        : num  NA 800 487 500 800 695 300 450 750 856 ...
##  $ Importance_owning_computer      : num  NA 500 0 200 800 754 800 1000 900 1000 ...
##  $ Importance_Internet_access      : num  NA 600 1000 500 800 856 800 1000 900 1000 ...
##  $ Left_Footlength_cm              : chr  "30" "31" NA "26" ...
##  $ Longer_foot                     : chr  "Right foot" "Right foot" "Same length" "Right foot" ...
##  $ Index_Fingerlength_mm           : chr  "100" "40" NA "7" ...
##  $ Ring_Fingerlength_mm            : chr  "105" "50" NA "7" ...
##  $ Longer_Finger_Lefthand          : chr  "Ring finger" "Ring finger" "Ring finger" "Index finger" ...
##  $ Birth_month                     : chr  "January" "September" "September" "February" ...
##  $ Favorite_Season                 : chr  "Winter" "Summer" "Spring" "Spring" ...
##  $ Allergies                       : chr  "No" "Yes" "No" "Yes" ...
##  $ Vegetarian                      : chr  "No" "No" "No" "No" ...
##  $ Favorite_Food                   : chr  "Meat" "Meat" "Seafood" "Poultry" ...
##  $ Beverage                        : chr  "Water" "Water" "Water" "Water" ...
##  $ Favorite_School_Subject         : chr  "Mathematics and statistics" "Physical education" "Mathematics and statistics" "Art" ...
##  $ Sleep_Hours_Schoolnight         : chr  "8" "7" NA "7" ...
##  $ Sleep_Hours_Non_Schoolnight     : chr  "6" "9" NA "9" ...
##  $ Home_Occupants                  : num  3 5 4 6 8 4 4 6 2 4 ...
##  $ Home_Internet_Access            : chr  "Yes - other" "Yes - dial-up connection" "Yes - other" "Yes - broadband connection" ...
##  $ Communication_With_Friends      : chr  "Cell phone" "Myspace, Facebook, other social networking sites, or blog" "Text messaging" NA ...
##  $ Text_Messages_Sent_Yesterday    : chr  "260" "20" "4" "15" ...
##  $ Text_Messages_Received_Yesterday: chr  "274" "30" "2" "15" ...
##  $ Hanging_Out_With_Friends_Hours  : chr  "10" "48" NA "10" ...
##  $ Talking_On_Phone_Hours          : chr  "2" "2" NA "1" ...
##  $ Doing_Homework_Hours            : chr  "12" "2" NA "0" ...
##  $ Doing_Things_With_Family_Hours  : chr  "2" "48" NA "1" ...
##  $ Outdoor_Activities_Hours        : chr  "18" "14" NA "0" ...
##  $ Video_Games_Hours               : chr  "3" "2" NA "0" ...
##  $ Social_Websites_Hours           : chr  "1" "6" NA "3" ...
##  $ Texting_Messaging_Hours         : chr  "4" "1" NA "20" ...
##  $ Computer_Use_Hours              : chr  "10" "3" NA "50" ...
##  $ Watching_TV_Hours               : chr  "4" "20" NA "5" ...
##  $ Paid_Work_Hours                 : chr  "15" "0" NA "15" ...
##  $ Work_At_Home_Hours              : chr  "0.5" "4" NA "7" ...
##  $ Schoolwork_Pressure             : chr  "Very little" "Some" "A lot" "Some" ...
##  $ Planned_Education_Level         : chr  "Graduate degree" "Graduate degree" "Graduate degree" "Other" ...
##  $ Favorite_Music                  : chr  "Country" "Pop" "Rap/Hip hop" "Rap/Hip hop" ...
##  $ Superpower                      : chr  "Invisibility" "Super strength" "Telepathy" "Telepathy" ...
##  $ Preferred_Status                : chr  "Happy" "Happy" "Happy" "Happy" ...
##  $ Role_Model_Type                 : chr  "Business person" "Relative" "Relative" "Friend" ...
##  $ Charity_Donation                : chr  "Religious" "Health" "International aid" "International aid" ...
# Per-column summary: quantiles, mean and NA count for numeric columns;
# length, class and mode for character columns.
summary(df)
##    Country             Region             DataYear      ClassGrade   
##  Length:452         Length:452         Min.   :2010   Min.   : 4.00  
##  Class :character   Class :character   1st Qu.:2016   1st Qu.:10.00  
##  Mode  :character   Mode  :character   Median :2018   Median :12.00  
##                                        Mean   :2018   Mean   :10.54  
##                                        3rd Qu.:2021   3rd Qu.:12.00  
##                                        Max.   :2024   Max.   :12.00  
##                                                                      
##     Gender             Ageyears       Handed           Height_cm        
##  Length:452         Min.   :10.0   Length:452         Length:452        
##  Class :character   1st Qu.:15.0   Class :character   Class :character  
##  Mode  :character   Median :17.0   Mode  :character   Mode  :character  
##                     Mean   :16.1                                        
##                     3rd Qu.:17.0                                        
##                     Max.   :63.0                                        
##                     NA's   :1                                           
##  Footlength_cm       Armspan_cm        Languages_spoken Travel_to_School  
##  Length:452         Length:452         Min.   :1.000    Length:452        
##  Class :character   Class :character   1st Qu.:1.000    Class :character  
##  Mode  :character   Mode  :character   Median :1.000    Mode  :character  
##                                        Mean   :1.511                      
##                                        3rd Qu.:2.000                      
##                                        Max.   :7.000                      
##                                        NA's   :5                          
##  Travel_time_to_School Reaction_time       Score_in_memory_game
##  Length:452            Min.   :    0.071   Length:452          
##  Class :character      1st Qu.:    0.338   Class :character    
##  Mode  :character      Median :    0.400   Mode  :character    
##                        Mean   :   77.743                       
##                        3rd Qu.:    0.520                       
##                        Max.   :31402.000                       
##                        NA's   :16                              
##  Favourite_physical_activity Importance_reducing_pollution
##  Length:452                  Min.   :    0.0              
##  Class :character            1st Qu.:  500.0              
##  Mode  :character            Median :  743.0              
##                              Mean   :  707.9              
##                              3rd Qu.:  979.5              
##                              Max.   :10000.0              
##                              NA's   :21                   
##  Importance_recycling_rubbish Importance_conserving_water
##  Min.   :    0.0              Length:452                 
##  1st Qu.:  487.0              Class :character           
##  Median :  700.0              Mode  :character           
##  Mean   :  719.6                                         
##  3rd Qu.:  900.0                                         
##  Max.   :10000.0                                         
##  NA's   :21                                              
##  Importance_saving_energy Importance_owning_computer Importance_Internet_access
##  Min.   :    0.0          Min.   :   0.0             Min.   :    0.0           
##  1st Qu.:  403.0          1st Qu.: 200.0             1st Qu.:  500.0           
##  Median :  700.0          Median : 513.0             Median :  800.0           
##  Mean   :  678.9          Mean   : 557.4             Mean   :  778.3           
##  3rd Qu.:  900.0          3rd Qu.: 900.0             3rd Qu.: 1000.0           
##  Max.   :10000.0          Max.   :7000.0             Max.   :10000.0           
##  NA's   :46               NA's   :21                 NA's   :21                
##  Left_Footlength_cm Longer_foot        Index_Fingerlength_mm
##  Length:452         Length:452         Length:452           
##  Class :character   Class :character   Class :character     
##  Mode  :character   Mode  :character   Mode  :character     
##                                                             
##                                                             
##                                                             
##                                                             
##  Ring_Fingerlength_mm Longer_Finger_Lefthand Birth_month       
##  Length:452           Length:452             Length:452        
##  Class :character     Class :character       Class :character  
##  Mode  :character     Mode  :character       Mode  :character  
##                                                                
##                                                                
##                                                                
##                                                                
##  Favorite_Season     Allergies          Vegetarian        Favorite_Food     
##  Length:452         Length:452         Length:452         Length:452        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    Beverage         Favorite_School_Subject Sleep_Hours_Schoolnight
##  Length:452         Length:452              Length:452             
##  Class :character   Class :character        Class :character       
##  Mode  :character   Mode  :character        Mode  :character       
##                                                                    
##                                                                    
##                                                                    
##                                                                    
##  Sleep_Hours_Non_Schoolnight Home_Occupants    Home_Internet_Access
##  Length:452                  Min.   :    1.0   Length:452          
##  Class :character            1st Qu.:    3.0   Class :character    
##  Mode  :character            Median :    4.0   Mode  :character    
##                              Mean   :  108.1                       
##                              3rd Qu.:    5.0                       
##                              Max.   :45752.0                       
##                              NA's   :11                            
##  Communication_With_Friends Text_Messages_Sent_Yesterday
##  Length:452                 Length:452                  
##  Class :character           Class :character            
##  Mode  :character           Mode  :character            
##                                                         
##                                                         
##                                                         
##                                                         
##  Text_Messages_Received_Yesterday Hanging_Out_With_Friends_Hours
##  Length:452                       Length:452                    
##  Class :character                 Class :character              
##  Mode  :character                 Mode  :character              
##                                                                 
##                                                                 
##                                                                 
##                                                                 
##  Talking_On_Phone_Hours Doing_Homework_Hours Doing_Things_With_Family_Hours
##  Length:452             Length:452           Length:452                    
##  Class :character       Class :character     Class :character              
##  Mode  :character       Mode  :character     Mode  :character              
##                                                                            
##                                                                            
##                                                                            
##                                                                            
##  Outdoor_Activities_Hours Video_Games_Hours  Social_Websites_Hours
##  Length:452               Length:452         Length:452           
##  Class :character         Class :character   Class :character     
##  Mode  :character         Mode  :character   Mode  :character     
##                                                                   
##                                                                   
##                                                                   
##                                                                   
##  Texting_Messaging_Hours Computer_Use_Hours Watching_TV_Hours 
##  Length:452              Length:452         Length:452        
##  Class :character        Class :character   Class :character  
##  Mode  :character        Mode  :character   Mode  :character  
##                                                               
##                                                               
##                                                               
##                                                               
##  Paid_Work_Hours    Work_At_Home_Hours Schoolwork_Pressure
##  Length:452         Length:452         Length:452         
##  Class :character   Class :character   Class :character   
##  Mode  :character   Mode  :character   Mode  :character   
##                                                           
##                                                           
##                                                           
##                                                           
##  Planned_Education_Level Favorite_Music      Superpower       
##  Length:452              Length:452         Length:452        
##  Class :character        Class :character   Class :character  
##  Mode  :character        Mode  :character   Mode  :character  
##                                                               
##                                                               
##                                                               
##                                                               
##  Preferred_Status   Role_Model_Type    Charity_Donation  
##  Length:452         Length:452         Length:452        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
## 
# Count columns by storage type. vapply() always yields a logical vector,
# even for a data frame with zero columns, where sapply() would return an
# empty list and make sum() fail.
num_vars <- sum(vapply(df, is.numeric, logical(1)))

# Factor and character columns are both counted as categorical.
cat_vars <- sum(vapply(
  df,
  function(column) is.factor(column) || is.character(column),
  logical(1)
))


cat("Numerical variables:", num_vars, "\n")
## Numerical variables: 11
cat("Categorical variables:", cat_vars, "\n")
## Categorical variables: 49

Missing values

# Set the plot width/height options before drawing the missingness chart.
# NOTE(review): repr.plot.* options are presumably only honoured by the
# Jupyter/IRkernel `repr` package — confirm they have an effect here.
options(repr.plot.width = 20, repr.plot.height = 30)
# NOTE(review): gg_miss_var() is presumably from the naniar package — confirm.
gg_miss_var(df)

# Percentage of NA entries in each column, printed in ascending order.
missing_percent <- colMeans(is.na(df)) * 100
sort(missing_percent)
##                          Country                           Region 
##                        0.0000000                        0.0000000 
##                         DataYear                       ClassGrade 
##                        0.0000000                        0.0000000 
##                           Gender                         Ageyears 
##                        0.2212389                        0.2212389 
##                           Handed                 Travel_to_School 
##                        0.2212389                        0.2212389 
##      Favourite_physical_activity                  Favorite_Season 
##                        0.2212389                        0.4424779 
##                      Birth_month                    Favorite_Food 
##                        0.6637168                        0.6637168 
##                         Beverage                 Languages_spoken 
##                        0.8849558                        1.1061947 
##            Travel_time_to_School                        Allergies 
##                        1.1061947                        1.1061947 
##          Favorite_School_Subject                       Vegetarian 
##                        1.1061947                        1.5486726 
##          Sleep_Hours_Schoolnight      Sleep_Hours_Non_Schoolnight 
##                        1.9911504                        1.9911504 
##                   Home_Occupants             Home_Internet_Access 
##                        2.4336283                        2.6548673 
##                        Height_cm     Text_Messages_Sent_Yesterday 
##                        2.8761062                        3.0973451 
## Text_Messages_Received_Yesterday                    Reaction_time 
##                        3.0973451                        3.5398230 
##                      Longer_foot               Left_Footlength_cm 
##                        3.5398230                        4.2035398 
##           Longer_Finger_Lefthand       Communication_With_Friends 
##                        4.2035398                        4.2035398 
##                    Footlength_cm    Importance_reducing_pollution 
##                        4.4247788                        4.6460177 
##     Importance_recycling_rubbish      Importance_conserving_water 
##                        4.6460177                        4.6460177 
##       Importance_owning_computer       Importance_Internet_access 
##                        4.6460177                        4.6460177 
##             Score_in_memory_game   Hanging_Out_With_Friends_Hours 
##                        4.8672566                        4.8672566 
##           Talking_On_Phone_Hours             Doing_Homework_Hours 
##                        5.9734513                        5.9734513 
##   Doing_Things_With_Family_Hours         Outdoor_Activities_Hours 
##                        6.6371681                        6.6371681 
##              Schoolwork_Pressure             Ring_Fingerlength_mm 
##                        6.8584071                        7.0796460 
##            Social_Websites_Hours          Planned_Education_Level 
##                        7.5221239                        7.5221239 
##                 Preferred_Status                 Charity_Donation 
##                        7.5221239                        7.5221239 
##                Video_Games_Hours                       Superpower 
##                        7.7433628                        7.7433628 
##               Work_At_Home_Hours                  Role_Model_Type 
##                        7.9646018                        7.9646018 
##            Index_Fingerlength_mm          Texting_Messaging_Hours 
##                        8.1858407                        8.1858407 
##               Computer_Use_Hours                Watching_TV_Hours 
##                        8.4070796                        8.6283186 
##                   Favorite_Music                       Armspan_cm 
##                        8.6283186                        8.8495575 
##                  Paid_Work_Hours         Importance_saving_energy 
##                        8.8495575                       10.1769912
# List every column name together with its position in the data frame.
colnames(df)
##  [1] "Country"                          "Region"                          
##  [3] "DataYear"                         "ClassGrade"                      
##  [5] "Gender"                           "Ageyears"                        
##  [7] "Handed"                           "Height_cm"                       
##  [9] "Footlength_cm"                    "Armspan_cm"                      
## [11] "Languages_spoken"                 "Travel_to_School"                
## [13] "Travel_time_to_School"            "Reaction_time"                   
## [15] "Score_in_memory_game"             "Favourite_physical_activity"     
## [17] "Importance_reducing_pollution"    "Importance_recycling_rubbish"    
## [19] "Importance_conserving_water"      "Importance_saving_energy"        
## [21] "Importance_owning_computer"       "Importance_Internet_access"      
## [23] "Left_Footlength_cm"               "Longer_foot"                     
## [25] "Index_Fingerlength_mm"            "Ring_Fingerlength_mm"            
## [27] "Longer_Finger_Lefthand"           "Birth_month"                     
## [29] "Favorite_Season"                  "Allergies"                       
## [31] "Vegetarian"                       "Favorite_Food"                   
## [33] "Beverage"                         "Favorite_School_Subject"         
## [35] "Sleep_Hours_Schoolnight"          "Sleep_Hours_Non_Schoolnight"     
## [37] "Home_Occupants"                   "Home_Internet_Access"            
## [39] "Communication_With_Friends"       "Text_Messages_Sent_Yesterday"    
## [41] "Text_Messages_Received_Yesterday" "Hanging_Out_With_Friends_Hours"  
## [43] "Talking_On_Phone_Hours"           "Doing_Homework_Hours"            
## [45] "Doing_Things_With_Family_Hours"   "Outdoor_Activities_Hours"        
## [47] "Video_Games_Hours"                "Social_Websites_Hours"           
## [49] "Texting_Messaging_Hours"          "Computer_Use_Hours"              
## [51] "Watching_TV_Hours"                "Paid_Work_Hours"                 
## [53] "Work_At_Home_Hours"               "Schoolwork_Pressure"             
## [55] "Planned_Education_Level"          "Favorite_Music"                  
## [57] "Superpower"                       "Preferred_Status"                
## [59] "Role_Model_Type"                  "Charity_Donation"

Numerical Data Handling

Checking for inconsistencies and missing values in numerical variables

# Names of the columns that are expected to hold numeric data and therefore
# need to be checked for stray text entries.
numerical_columns <- c(
  "ClassGrade",
  "Ageyears",
  "Height_cm",
  "Footlength_cm",
  "Armspan_cm",
  "Languages_spoken",
  "Travel_time_to_School",
  "Reaction_time",
  "Score_in_memory_game",
  "Importance_reducing_pollution",
  "Importance_recycling_rubbish",
  "Importance_conserving_water",
  "Importance_saving_energy",
  "Importance_owning_computer",
  "Importance_Internet_access",
  "Left_Footlength_cm",
  "Index_Fingerlength_mm",
  "Ring_Fingerlength_mm",
  "Sleep_Hours_Schoolnight",
  "Sleep_Hours_Non_Schoolnight",
  "Home_Occupants",
  "Text_Messages_Sent_Yesterday",
  "Text_Messages_Received_Yesterday",
  "Hanging_Out_With_Friends_Hours",
  "Talking_On_Phone_Hours",
  "Doing_Homework_Hours",
  "Doing_Things_With_Family_Hours",
  "Outdoor_Activities_Hours",
  "Video_Games_Hours",
  "Social_Websites_Hours",
  "Texting_Messaging_Hours",
  "Computer_Use_Hours",
  "Watching_TV_Hours",
  "Paid_Work_Hours",
  "Work_At_Home_Hours"
)
# Report, for each column that is expected to be numeric, the distinct
# entries that cannot be parsed as numbers and the number of missing values.
#
# Args:
#   df: data frame to inspect.
#   numerical_columns: character vector of column names in `df` to check.
#
# Returns:
#   Invisibly, a list named after `numerical_columns` holding the distinct
#   non-numeric entries of each column (character(0) when there are none).
#   The per-column report is printed as a side effect.
check_categorical_in_numerical <- function(df, numerical_columns) {
  non_numeric_by_column <- vector("list", length(numerical_columns))
  names(non_numeric_by_column) <- numerical_columns

  for (col in numerical_columns) {
    column_values <- df[[col]]
    is_missing <- is.na(column_values)

    # Missing entries are counted separately below; leaving them in would
    # report NA as a "categorical value" for columns that are purely numeric.
    unique_values <- unique(as.character(column_values[!is_missing]))

    # Failed conversions are exactly what is being detected here, so the
    # "NAs introduced by coercion" warning is expected and carries no news.
    parsed_values <- suppressWarnings(as.numeric(unique_values))
    non_numeric_values <- unique_values[is.na(parsed_values)]
    non_numeric_by_column[[col]] <- non_numeric_values

    cat("Column '", col, "':\n", sep = "")
    if (length(non_numeric_values) > 0) {
      cat("  - Contains categorical values:\n")
      print(non_numeric_values)
    } else {
      cat("  - Contains only numeric values.\n")
    }
    cat("  - Number of missing values (NA):", sum(is_missing), "\n\n")
  }

  invisible(non_numeric_by_column)
}
# Run the check on every column listed in numerical_columns; the report is
# printed to the console.
check_categorical_in_numerical(df, numerical_columns)
## Column 'ClassGrade':
##   - Contains only numeric values.
##   - Number of missing values (NA): 0 
## 
## Column 'Ageyears':
##   - Contains categorical values:
## [1] NA
##   - Number of missing values (NA): 1
## Column 'Height_cm':
##   - Contains categorical values:
##  [1] "5'3"    "5'11"   "4”10"   "5'2"    NA       "4'9"    "5'4"    "5,8"   
##  [9] "4'10"   "5’2”"   "5'6"    "177cm"  "5'5"    "5´3"    "5'3\""  "5' 3\""
## [17] "158cm"  "157cm"  "172cm"  "5' 11" 
##   - Number of missing values (NA): 13
## Column 'Footlength_cm':
##   - Contains categorical values:
## [1] NA     "29cm" "24cm" "20cm" "31cm" "12cm"
##   - Number of missing values (NA): 20
## Column 'Armspan_cm':
##   - Contains categorical values:
## [1] NA      "56cm"  "189cm" "160cm" "165cm" "71cm" 
##   - Number of missing values (NA): 40 
## 
## Column 'Languages_spoken':
##   - Contains categorical values:
## [1] NA
##   - Number of missing values (NA): 5
## Column 'Travel_time_to_School':
##   - Contains categorical values:
##  [1] NA      "3 min" "7 ish" "21 m"  "1hr"   "5 min" "12min" "Oof"   "14min"
## [10] "20-25" "1hour"
##   - Number of missing values (NA): 5 
## 
## Column 'Reaction_time':
##   - Contains categorical values:
## [1] NA
##   - Number of missing values (NA): 16
## Column 'Score_in_memory_game':
##   - Contains categorical values:
## [1] NA     "52 s"
##   - Number of missing values (NA): 22 
## 
## Column 'Importance_reducing_pollution':
##   - Contains categorical values:
## [1] NA
##   - Number of missing values (NA): 21 
## 
## Column 'Importance_recycling_rubbish':
##   - Contains categorical values:
## [1] NA
##   - Number of missing values (NA): 21
## Column 'Importance_conserving_water':
##   - Contains categorical values:
## [1] NA  "?"
##   - Number of missing values (NA): 21 
## 
## Column 'Importance_saving_energy':
##   - Contains categorical values:
## [1] NA
##   - Number of missing values (NA): 46 
## 
## Column 'Importance_owning_computer':
##   - Contains categorical values:
## [1] NA
##   - Number of missing values (NA): 21 
## 
## Column 'Importance_Internet_access':
##   - Contains categorical values:
## [1] NA
##   - Number of missing values (NA): 21
## Column 'Left_Footlength_cm':
##   - Contains categorical values:
## [1] NA     "8cm"  "12cm"
##   - Number of missing values (NA): 19
## Column 'Index_Fingerlength_mm':
##   - Contains categorical values:
## [1] NA     "30mm" "idk"  "8cm"  "80mm" "70mm" "3'4" 
##   - Number of missing values (NA): 37
## Column 'Ring_Fingerlength_mm':
##   - Contains categorical values:
## [1] NA      "25mm"  "idk"   "7cm"   "60 mm" "3'2"  
##   - Number of missing values (NA): 32
## Column 'Sleep_Hours_Schoolnight':
##   - Contains categorical values:
## [1] NA      "6hr"   "6hour" "7 1/2"
##   - Number of missing values (NA): 9
## Column 'Sleep_Hours_Non_Schoolnight':
##   - Contains categorical values:
## [1] NA      "6~7"   "less"  "yes"   "8hour"
##   - Number of missing values (NA): 9 
## 
## Column 'Home_Occupants':
##   - Contains categorical values:
## [1] NA
##   - Number of missing values (NA): 11
## Column 'Text_Messages_Sent_Yesterday':
##   - Contains categorical values:
##  [1] NA      "a lot" "40ish" "A lot" "200`"  "yes"   "n/a"   "alot"  "30-40"
## [10] "50ish" "30-50" "Idk"  
##   - Number of missing values (NA): 14
## Column 'Text_Messages_Received_Yesterday':
##   - Contains categorical values:
##  [1] NA      "5~6"   "1000s" "idk"   "35ish" "A lot" "Idk"   "n/a"   "alot" 
## [10] "30-40" "40-60" "?"    
##   - Number of missing values (NA): 14
## Column 'Hanging_Out_With_Friends_Hours':
##   - Contains categorical values:
## [1] NA      "5~6hr" "4hr"   "4hour" "2hrs"  "Idk"  
##   - Number of missing values (NA): 22
## Column 'Talking_On_Phone_Hours':
##   - Contains categorical values:
## [1] NA      "2hr"   "<1"    "1hr"   "5 %"   "yes"   "2hour" "8mins" "Idk"  
##   - Number of missing values (NA): 27
## Column 'Doing_Homework_Hours':
##   - Contains categorical values:
## [1] NA      "2 hr"  "1hr"   "24/7"  "2hour" "2hr"   "<1"    "Idk"  
##   - Number of missing values (NA): 27
## Column 'Doing_Things_With_Family_Hours':
##   - Contains categorical values:
## [1] NA      "1 hr"  "6hr"   "24/7"  "5hour" "6hrs"  "Idk"  
##   - Number of missing values (NA): 30
## Column 'Outdoor_Activities_Hours':
##   - Contains categorical values:
## [1] NA      "1 hr"  "30min" "1hour" "6hrs"  "Idk"   "P"    
##   - Number of missing values (NA): 30
## Column 'Video_Games_Hours':
##   - Contains categorical values:
## [1] NA      "1 hr"  "15min" "yes"   "10min" "7hrs"  "Idk"  
##   - Number of missing values (NA): 35
## Column 'Social_Websites_Hours':
##   - Contains categorical values:
## [1] NA      "1 hr"  "5hr"   "24/7"  "1hour" "14hrs" "<1"    "Idk"  
##   - Number of missing values (NA): 34
## Column 'Texting_Messaging_Hours':
##   - Contains categorical values:
## [1] NA      "2 hr"  "30min" "24/7"  "15min" "20hrs" "Idk"  
##   - Number of missing values (NA): 37
## Column 'Computer_Use_Hours':
##   - Contains categorical values:
## [1] NA      "24/7"  "1 hr"  "4hr"   "yes"   ";"     "1hour" "3hrs"  "Idk"  
##   - Number of missing values (NA): 38
## Column 'Watching_TV_Hours':
##   - Contains categorical values:
## [1] NA      "24/7"  "2 hr"  "<1"    "3hr"   "yes"   "30min" "1hr"   "Idk"  
##   - Number of missing values (NA): 39
## Column 'Paid_Work_Hours':
##   - Contains categorical values:
## [1] NA      "8hr"   "n/a"   "7hour" "<1"    "35-40" "Idk"  
##   - Number of missing values (NA): 40
## Column 'Work_At_Home_Hours':
##   - Contains categorical values:
## [1] NA      "1 hr"  "30min" "n/a"   "6hour" "20hrs" "<1"    "Idk"   "3 1/2"
##   - Number of missing values (NA): 36
# Keep only the columns named in `numerical_columns`. The external character
# vector is wrapped in all_of() because passing it bare to select() is
# deprecated since tidyselect 1.1.0 and is ambiguous if `df` ever gains a
# column called "numerical_columns".
num_data <- df %>%
  select(all_of(numerical_columns))
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(numerical_columns)
## 
##   # Now:
##   data %>% select(all_of(numerical_columns))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Visualise missingness in the selected columns.
# NOTE(review): gg_miss_var() and vis_miss() are presumably from the
# naniar/visdat packages — confirm which library provides them.
gg_miss_var(num_data)

vis_miss(num_data)

Outliers in numerical variables

# Single box plot of the Ageyears column, used to spot extreme values.
ggplot(data = df, mapping = aes(y = Ageyears)) +
  geom_boxplot(fill = "lightblue") +
  labs(y = "Values", title = "Boxplot of Ageyears")
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).

  • Some numerical variables have outliers; for example, Ageyears has a maximum of 63 while its third quartile is 17.
# Height_cm was read as character (it mixes plain numbers with entries such
# as "5'3" and "177cm"), so a box plot of the raw column is not meaningful.
# Coerce it for plotting: entries that do not parse as numbers become NA and
# are dropped from the plot; the coercion warning is expected and silenced.
ggplot(df, aes(y = suppressWarnings(as.numeric(Height_cm)))) +
  geom_boxplot(fill = "darkseagreen") +
  labs(title = "Boxplot of Height_cm", y = "Values")

# List the distinct raw entries to see which formats need cleaning.
unique(df$Height_cm)
##   [1] "182"                "70"                 "5'3"               
##   [4] "172"                "165"                "155"               
##   [7] "169"                "160"                "190"               
##  [10] "167"                "5'11"               "170.2"             
##  [13] "149"                "167.6"              "175"               
##  [16] "175.2"              "154"                "4”10"              
##  [19] "183"                "4.1100000000000003" "181"               
##  [22] "152.4"              "145"                "180"               
##  [25] "5'2"                "164"                "162"               
##  [28] "157"                "182.9"              "176"               
##  [31] "163"                "170"                "178"               
##  [34] "158"                "161"                NA                  
##  [37] "143"                "154.9"              "176.5"             
##  [40] "132"                "182.8"              "174"               
##  [43] "173"                "152"                "157.5"             
##  [46] "63"                 "4'9"                "5'4"               
##  [49] "150.30000000000001" "62"                 "193"               
##  [52] "177.8"              "5,8"                "150"               
##  [55] "156"                "159"                "153"               
##  [58] "187.9"              "175.3"              "177"               
##  [61] "124"                "4'10"               "165.1"             
##  [64] "187"                "147"                "163.4"             
##  [67] "150.5"              "181.6"              "5"                 
##  [70] "146"                "168"                "134"               
##  [73] "191.5"              "146.5"              "158.19999999999999"
##  [76] "166"                "184"                "189.2"             
##  [79] "187.5"              "68"                 "149.9"             
##  [82] "5’2”"               "200"                "171"               
##  [85] "5'6"                "320"                "129.5"             
##  [88] "13.3"               "172.7"              "192"               
##  [91] "144"                "177cm"              "74"                
##  [94] "45788"              "5'5"                "196"               
##  [97] "120"                "6"                  "190.5"             
## [100] "5´3"                "172.5"              "5'3\""             
## [103] "154.30000000000001" "179"                "180.3"             
## [106] "67"                 "170.1"              "157.4"             
## [109] "169.5"              "176.7"              "5' 3\""            
## [112] "158cm"              "56"                 "157cm"             
## [115] "160.19999999999999" "151"                "5.7"               
## [118] "138"                "61"                 "148"               
## [121] "170.5"              "162.5"              "5.3"               
## [124] "164.5"              "141"                "64"                
## [127] "142"                "52"                 "134.9"             
## [130] "139"                "5.6"                "22.86"             
## [133] "172cm"              "185"                "188"               
## [136] "156.19999999999999" "12"                 "65"                
## [139] "5' 11"              "69"                 "64.400000000000006"

  • Some variables are recorded inconsistently (e.g. heights entered in feet and inches, or with a "cm" suffix).

Handling inconsistencies in numerical columns

# Convert free-text length responses to numeric.
#
# Arguments:
#   height: character (or numeric) vector of responses such as "182",
#           "177cm", "5'3", "5' 11" or "5’2”".
# Returns:
#   A numeric vector the same length as `height`.
#   - Entries written as feet'inches (any straight, curly or acute quote mark)
#     are converted to centimetres: feet * 30.48 + inches * 2.54.
#   - Every other entry has "cm" and all non-numeric characters stripped and is
#     returned as written; no unit conversion or plausibility check is applied.
#   - Entries that still cannot be parsed (e.g. "5,8") become NA; the coercion
#     warning is suppressed because NA is the intended result.
convert_height <- function(height) {
  # Already-numeric input needs no text cleaning. Routing it through
  # as.character() would mangle scientific notation (1e+05 -> "1 05" -> NA)
  # and drop minus signs.
  if (is.numeric(height)) {
    return(as.numeric(height))
  }

  cleaned <- trimws(tolower(as.character(height)))
  cleaned <- gsub("cm", "", cleaned, fixed = TRUE)

  # Normalise every quote-like mark (", `, acute accent, curly quotes, primes)
  # to a plain apostrophe so one pattern covers all feet/inch spellings.
  quote_marks <- "[\"`\u00b4\u2018\u2019\u201c\u201d\u2032\u2033]"
  cleaned <- gsub(quote_marks, "'", cleaned)

  # <feet> ' [<inches>] ['] with optional spaces around the marks.
  ft_in_pattern <- paste0(
    "^([0-9]+)[[:space:]]*'+[[:space:]]*",
    "([0-9]+(\\.[0-9]+)?)?[[:space:]]*'*$"
  )
  is_ft_in <- !is.na(cleaned) & grepl(ft_in_pattern, cleaned)

  result <- rep(NA_real_, length(cleaned))

  if (any(is_ft_in)) {
    feet <- as.numeric(sub(ft_in_pattern, "\\1", cleaned[is_ft_in]))
    inches <- as.numeric(sub(ft_in_pattern, "\\2", cleaned[is_ft_in]))
    inches[is.na(inches)] <- 0  # "5'" carries no inches part
    result[is_ft_in] <- feet * 30.48 + inches * 2.54
  }

  # Plain path: keep only digits and periods, then coerce.
  plain <- gsub("[^0-9.]", " ", cleaned[!is_ft_in])
  result[!is_ft_in] <- suppressWarnings(as.numeric(trimws(plain)))

  result
}

# Convert free-text time responses (e.g. "30min", "1 hour") to numeric.
#
# Arguments:
#   time: character (or numeric) vector of responses.
# Returns:
#   A numeric vector the same length as `time`. The literal substrings "min"
#   and "hour" are removed, every remaining character other than a digit or a
#   period is blanked out, and the result is coerced with as.numeric().
#   Units are NOT normalised: "1 hour" yields 1, not 60. Entries that do not
#   reduce to a single number become NA (with the usual coercion warning).
convert_time <- function(time) {
  # Already-numeric input needs no text cleaning. Routing it through
  # as.character() would mangle scientific notation (1e+05 -> "1 05" -> NA)
  # and drop minus signs.
  if (is.numeric(time)) {
    return(as.numeric(time))
  }

  time <- tolower(time)
  time <- gsub("min", "", time, fixed = TRUE)
  time <- gsub("hour", "", time, fixed = TRUE)
  time <- gsub("[^0-9.]", " ", time)  # keep only digits and periods
  as.numeric(trimws(time))
}

# Coerce a column that should be numeric but may contain stray text.
#
# Arguments:
#   column: character or numeric vector.
# Returns:
#   A numeric vector the same length as `column`. For character input every
#   character other than a digit or a period is blanked out before coercion,
#   so "8hr" -> 8 and "<1" -> 1, while entries that do not reduce to a single
#   number ("n/a", "35-40", "3 1/2") become NA.
clean_numeric_column <- function(column) {
  # This helper is also applied to columns that are already numeric. Those
  # must bypass the text cleaning: as.character() prints some values in
  # scientific notation (100000 -> "1e+05"), which the regex would turn into
  # "1 05" and then NA, and it would strip minus signs from negative values.
  if (is.numeric(column)) {
    return(as.numeric(column))
  }

  column <- tolower(column)
  column <- gsub("[^0-9.]", " ", column)  # keep only digits and periods
  as.numeric(trimws(column))
}

# Build the cleaned data frame: each column is overwritten in place by the
# output of the cleaner assigned to it (columns keep their original position).
df_new <- df %>%
  mutate(
    # Length measurements go through convert_height()
    across(
      c(Height_cm, Footlength_cm, Armspan_cm, Left_Footlength_cm),
      convert_height
    ),
    # Travel time goes through convert_time()
    across(Travel_time_to_School, convert_time),
    # Everything else goes through the generic numeric cleaner
    across(
      c(
        Reaction_time,
        Score_in_memory_game,
        Importance_reducing_pollution,
        Importance_recycling_rubbish,
        Importance_conserving_water,
        Importance_saving_energy,
        Importance_owning_computer,
        Importance_Internet_access,
        Index_Fingerlength_mm,
        Ring_Fingerlength_mm,
        Sleep_Hours_Schoolnight,
        Sleep_Hours_Non_Schoolnight,
        Home_Occupants,
        Text_Messages_Sent_Yesterday,
        Text_Messages_Received_Yesterday,
        Hanging_Out_With_Friends_Hours,
        Talking_On_Phone_Hours,
        Doing_Homework_Hours,
        Doing_Things_With_Family_Hours,
        Outdoor_Activities_Hours,
        Video_Games_Hours,
        Social_Websites_Hours,
        Texting_Messaging_Hours,
        Computer_Use_Hours,
        Watching_TV_Hours,
        Paid_Work_Hours,
        Work_At_Home_Hours
      ),
      clean_numeric_column
    )
  )

# Inspect the structure (column types and first values) to confirm that the
# cleaned columns are now numeric
str(df_new)
## 'data.frame':    452 obs. of  60 variables:
##  $ Country                         : chr  "USA" "USA" "USA" "USA" ...
##  $ Region                          : chr  "GA" "NE" "SC" "TX" ...
##  $ DataYear                        : num  2022 2017 2020 2023 2014 ...
##  $ ClassGrade                      : num  12 10 12 12 12 8 11 12 10 12 ...
##  $ Gender                          : chr  "Male" "Male" "Female" "Female" ...
##  $ Ageyears                        : num  17 16 18 17 17 11 16 17 15 16 ...
##  $ Handed                          : chr  "Right-Handed" "Right-Handed" "Right-Handed" "Right-Handed" ...
##  $ Height_cm                       : num  182 70 NA 172 165 155 169 160 190 167 ...
##  $ Footlength_cm                   : num  33 32 NA 26 25 25.6 24 22 30 23 ...
##  $ Armspan_cm                      : num  193 170 NA 177 168 155 50 159 192 69 ...
##  $ Languages_spoken                : num  1 2 1 1 1 1 2 1 1.5 2 ...
##  $ Travel_to_School                : chr  "Car" "Car" "Car" "Car" ...
##  $ Travel_time_to_School           : num  10 30 NA 7 10 15 10 7 10 7 ...
##  $ Reaction_time                   : num  0.324 0.34 3.061 0.328 0.501 ...
##  $ Score_in_memory_game            : num  37 50 39 35 40 61 32 33 30 30 ...
##  $ Favourite_physical_activity     : chr  "Lacrosse" "Baseball/Softball" "Athletics" "Other" ...
##  $ Importance_reducing_pollution   : num  NA 900 299 650 800 572 900 450 200 791 ...
##  $ Importance_recycling_rubbish    : num  NA 900 1000 600 900 583 500 450 100 503 ...
##  $ Importance_conserving_water     : num  NA 900 326 450 850 620 300 450 400 834 ...
##  $ Importance_saving_energy        : num  NA 800 487 500 800 695 300 450 750 856 ...
##  $ Importance_owning_computer      : num  NA 500 0 200 800 754 800 1000 900 1000 ...
##  $ Importance_Internet_access      : num  NA 600 1000 500 800 856 800 1000 900 1000 ...
##  $ Left_Footlength_cm              : num  30 31 NA 26 25 24.2 24 22 26 23 ...
##  $ Longer_foot                     : chr  "Right foot" "Right foot" "Same length" "Right foot" ...
##  $ Index_Fingerlength_mm           : num  100 40 NA 7 75 67 77 80 82 70 ...
##  $ Ring_Fingerlength_mm            : num  105 50 NA 7 75 67 69 78 80 80 ...
##  $ Longer_Finger_Lefthand          : chr  "Ring finger" "Ring finger" "Ring finger" "Index finger" ...
##  $ Birth_month                     : chr  "January" "September" "September" "February" ...
##  $ Favorite_Season                 : chr  "Winter" "Summer" "Spring" "Spring" ...
##  $ Allergies                       : chr  "No" "Yes" "No" "Yes" ...
##  $ Vegetarian                      : chr  "No" "No" "No" "No" ...
##  $ Favorite_Food                   : chr  "Meat" "Meat" "Seafood" "Poultry" ...
##  $ Beverage                        : chr  "Water" "Water" "Water" "Water" ...
##  $ Favorite_School_Subject         : chr  "Mathematics and statistics" "Physical education" "Mathematics and statistics" "Art" ...
##  $ Sleep_Hours_Schoolnight         : num  8 7 NA 7 8 7 6 6.5 6 6 ...
##  $ Sleep_Hours_Non_Schoolnight     : num  6 9 NA 9 11 9 8 9 11 6 ...
##  $ Home_Occupants                  : num  3 5 4 6 8 4 4 6 2 4 ...
##  $ Home_Internet_Access            : chr  "Yes - other" "Yes - dial-up connection" "Yes - other" "Yes - broadband connection" ...
##  $ Communication_With_Friends      : chr  "Cell phone" "Myspace, Facebook, other social networking sites, or blog" "Text messaging" NA ...
##  $ Text_Messages_Sent_Yesterday    : num  260 20 4 15 7 0 1000 4 20 10 ...
##  $ Text_Messages_Received_Yesterday: num  274 30 2 15 15 ...
##  $ Hanging_Out_With_Friends_Hours  : num  10 48 NA 10 2 10 20 2 10 35 ...
##  $ Talking_On_Phone_Hours          : num  2 2 NA 1 1 2 20 0 1 2 ...
##  $ Doing_Homework_Hours            : num  12 2 NA 0 9 20 10 6 4 10 ...
##  $ Doing_Things_With_Family_Hours  : num  2 48 NA 1 7 35 0 10 20 24 ...
##  $ Outdoor_Activities_Hours        : num  18 14 NA 0 11 27 0 0 20 12 ...
##  $ Video_Games_Hours               : num  3 2 NA 0 0 30 25 14 15 20 ...
##  $ Social_Websites_Hours           : num  1 6 NA 3 20 0 16 7 15 4 ...
##  $ Texting_Messaging_Hours         : num  4 1 NA 20 1 1 24 2 1 3 ...
##  $ Computer_Use_Hours              : num  10 3 NA 50 21 25 30 21 25 39 ...
##  $ Watching_TV_Hours               : num  4 20 NA 5 11 7 5 14 5 4 ...
##  $ Paid_Work_Hours                 : num  15 0 NA 15 15 0 4 0 0 0 ...
##  $ Work_At_Home_Hours              : num  0.5 4 NA 7 2 0 2 6 4 0 ...
##  $ Schoolwork_Pressure             : chr  "Very little" "Some" "A lot" "Some" ...
##  $ Planned_Education_Level         : chr  "Graduate degree" "Graduate degree" "Graduate degree" "Other" ...
##  $ Favorite_Music                  : chr  "Country" "Pop" "Rap/Hip hop" "Rap/Hip hop" ...
##  $ Superpower                      : chr  "Invisibility" "Super strength" "Telepathy" "Telepathy" ...
##  $ Preferred_Status                : chr  "Happy" "Happy" "Happy" "Happy" ...
##  $ Role_Model_Type                 : chr  "Business person" "Relative" "Relative" "Friend" ...
##  $ Charity_Donation                : chr  "Religious" "Health" "International aid" "International aid" ...
# Per-column summary of the cleaned data (quartiles, mean, min/max and NA
# counts for numeric columns) to spot implausible values.
summary(df_new)
##    Country             Region             DataYear      ClassGrade   
##  Length:452         Length:452         Min.   :2010   Min.   : 4.00  
##  Class :character   Class :character   1st Qu.:2016   1st Qu.:10.00  
##  Mode  :character   Mode  :character   Median :2018   Median :12.00  
##                                        Mean   :2018   Mean   :10.54  
##                                        3rd Qu.:2021   3rd Qu.:12.00  
##                                        Max.   :2024   Max.   :12.00  
##                                                                      
##     Gender             Ageyears       Handed            Height_cm       
##  Length:452         Min.   :10.0   Length:452         Min.   :    4.11  
##  Class :character   1st Qu.:15.0   Class :character   1st Qu.:  155.25  
##  Mode  :character   Median :17.0   Mode  :character   Median :  165.10  
##                     Mean   :16.1                      Mean   :  267.83  
##                     3rd Qu.:17.0                      3rd Qu.:  176.00  
##                     Max.   :63.0                      Max.   :45788.00  
##                     NA's   :1                         NA's   :30        
##  Footlength_cm      Armspan_cm    Languages_spoken Travel_to_School  
##  Min.   :  0.25   Min.   :  1.2   Min.   :1.000    Length:452        
##  1st Qu.: 22.77   1st Qu.:150.0   1st Qu.:1.000    Class :character  
##  Median : 24.00   Median :163.0   Median :1.000    Mode  :character  
##  Mean   : 27.22   Mean   :152.0   Mean   :1.511                      
##  3rd Qu.: 26.70   3rd Qu.:175.3   3rd Qu.:2.000                      
##  Max.   :274.00   Max.   :416.2   Max.   :7.000                      
##  NA's   :20       NA's   :40      NA's   :5                          
##  Travel_time_to_School Reaction_time       Score_in_memory_game
##  Min.   :    0.0       Min.   :    0.071   Min.   :  0.359     
##  1st Qu.:    7.0       1st Qu.:    0.338   1st Qu.: 36.000     
##  Median :   10.0       Median :    0.400   Median : 42.000     
##  Mean   :  220.6       Mean   :   77.743   Mean   : 43.743     
##  3rd Qu.:   20.0       3rd Qu.:    0.520   3rd Qu.: 49.750     
##  Max.   :45787.0       Max.   :31402.000   Max.   :121.000     
##  NA's   :7             NA's   :16          NA's   :22          
##  Favourite_physical_activity Importance_reducing_pollution
##  Length:452                  Min.   :    0.0              
##  Class :character            1st Qu.:  500.0              
##  Mode  :character            Median :  743.0              
##                              Mean   :  707.9              
##                              3rd Qu.:  979.5              
##                              Max.   :10000.0              
##                              NA's   :21                   
##  Importance_recycling_rubbish Importance_conserving_water
##  Min.   :    0.0              Min.   :    0.0            
##  1st Qu.:  487.0              1st Qu.:  400.0            
##  Median :  700.0              Median :  645.0            
##  Mean   :  719.6              Mean   :  636.5            
##  3rd Qu.:  900.0              3rd Qu.:  900.0            
##  Max.   :10000.0              Max.   :10000.0            
##  NA's   :21                   NA's   :22                 
##  Importance_saving_energy Importance_owning_computer Importance_Internet_access
##  Min.   :    0.0          Min.   :   0.0             Min.   :    0.0           
##  1st Qu.:  403.0          1st Qu.: 200.0             1st Qu.:  500.0           
##  Median :  700.0          Median : 513.0             Median :  800.0           
##  Mean   :  678.9          Mean   : 557.4             Mean   :  778.3           
##  3rd Qu.:  900.0          3rd Qu.: 900.0             3rd Qu.: 1000.0           
##  Max.   :10000.0          Max.   :7000.0             Max.   :10000.0           
##  NA's   :46               NA's   :21                 NA's   :21                
##  Left_Footlength_cm Longer_foot        Index_Fingerlength_mm
##  Min.   :  0.24     Length:452         Min.   :   0.1016    
##  1st Qu.: 22.50     Class :character   1st Qu.:  18.8000    
##  Median : 24.00     Mode  :character   Median :  70.5000    
##  Mean   : 26.93                        Mean   :  72.1799    
##  3rd Qu.: 26.00                        3rd Qu.:  80.0000    
##  Max.   :335.00                        Max.   :1500.0000    
##  NA's   :19                            NA's   :39           
##  Ring_Fingerlength_mm Longer_Finger_Lefthand Birth_month       
##  Min.   :   0.1016    Length:452             Length:452        
##  1st Qu.:  20.2750    Class :character       Class :character  
##  Median :  70.7500    Mode  :character       Mode  :character  
##  Mean   :  72.7494                                             
##  3rd Qu.:  80.0000                                             
##  Max.   :1700.0000                                             
##  NA's   :34                                                    
##  Favorite_Season     Allergies          Vegetarian        Favorite_Food     
##  Length:452         Length:452         Length:452         Length:452        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    Beverage         Favorite_School_Subject Sleep_Hours_Schoolnight
##  Length:452         Length:452              Min.   :    2          
##  Class :character   Class :character        1st Qu.:    6          
##  Mode  :character   Mode  :character        Median :    7          
##                                             Mean   : 1043          
##                                             3rd Qu.:    8          
##                                             Max.   :45847          
##                                             NA's   :10             
##  Sleep_Hours_Non_Schoolnight Home_Occupants    Home_Internet_Access
##  Min.   :    1               Min.   :    1.0   Length:452          
##  1st Qu.:    8               1st Qu.:    3.0   Class :character    
##  Median :    9               Median :    4.0   Mode  :character    
##  Mean   : 1156               Mean   :  108.1                       
##  3rd Qu.:   10               3rd Qu.:    5.0                       
##  Max.   :46004               Max.   :45752.0                       
##  NA's   :12                  NA's   :11                            
##  Communication_With_Friends Text_Messages_Sent_Yesterday
##  Length:452                 Min.   :   0.00             
##  Class :character           1st Qu.:   7.25             
##  Mode  :character           Median :  30.00             
##                             Mean   :  83.74             
##                             3rd Qu.: 100.00             
##                             Max.   :2000.00             
##                             NA's   :22                  
##  Text_Messages_Received_Yesterday Hanging_Out_With_Friends_Hours
##  Min.   :    0.00                 Min.   :    0.0               
##  1st Qu.:   10.75                 1st Qu.:    4.0               
##  Median :   35.00                 Median :    9.5               
##  Mean   :  133.50                 Mean   :  228.2               
##  3rd Qu.:  108.25                 3rd Qu.:   18.0               
##  Max.   :10000.00                 Max.   :45846.0               
##  NA's   :24                       NA's   :24                    
##  Talking_On_Phone_Hours Doing_Homework_Hours Doing_Things_With_Family_Hours
##  Min.   :    0.0        Min.   :    0.0      Min.   :    0.0               
##  1st Qu.:    1.0        1st Qu.:    2.0      1st Qu.:    3.0               
##  Median :    1.5        Median :    6.0      Median :    5.0               
##  Mean   :  113.6        Mean   :  117.4      Mean   :  228.4               
##  3rd Qu.:    4.0        3rd Qu.:   10.0      3rd Qu.:   12.0               
##  Max.   :45878.0        Max.   :45721.0      Max.   :45910.0               
##  NA's   :29             NA's   :29           NA's   :32                    
##  Outdoor_Activities_Hours Video_Games_Hours Social_Websites_Hours
##  Min.   :    0.00         Min.   :  0.000   Min.   :  0.00       
##  1st Qu.:    2.00         1st Qu.:  0.000   1st Qu.:  2.00       
##  Median :    5.00         Median :  1.000   Median :  5.00       
##  Mean   :  226.05         Mean   :  5.087   Mean   : 11.71       
##  3rd Qu.:   10.12         3rd Qu.:  6.500   3rd Qu.: 12.00       
##  Max.   :45879.00         Max.   :126.000   Max.   :168.00       
##  NA's   :32               NA's   :37        NA's   :36           
##  Texting_Messaging_Hours Computer_Use_Hours Watching_TV_Hours Paid_Work_Hours 
##  Min.   :    0.0         Min.   :    0      Min.   :    0.0   Min.   : 0.000  
##  1st Qu.:    2.0         1st Qu.:    4      1st Qu.:    1.0   1st Qu.: 0.000  
##  Median :    4.0         Median :   10      Median :    4.0   Median : 0.000  
##  Mean   :  148.3         Mean   :  158      Mean   :  230.7   Mean   : 6.513  
##  3rd Qu.:   14.0         3rd Qu.:   25      3rd Qu.:    9.0   3rd Qu.:10.000  
##  Max.   :45720.0         Max.   :45692      Max.   :45945.0   Max.   :60.000  
##  NA's   :39              NA's   :44         NA's   :42        NA's   :43      
##  Work_At_Home_Hours Schoolwork_Pressure Planned_Education_Level
##  Min.   : 0.000     Length:452          Length:452             
##  1st Qu.: 1.000     Class :character    Class :character       
##  Median : 3.000     Mode  :character    Mode  :character       
##  Mean   : 5.261                                                
##  3rd Qu.: 5.000                                                
##  Max.   :76.000                                                
##  NA's   :39                                                    
##  Favorite_Music      Superpower        Preferred_Status   Role_Model_Type   
##  Length:452         Length:452         Length:452         Length:452        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Charity_Donation  
##  Length:452        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 
# Plot missingness across the whole cleaned data frame.
# NOTE(review): vis_miss() is not defined in this section -- presumably from
# visdat/naniar; confirm against the library() calls.
vis_miss(df_new)

Handling missing values in numerical columns

# Fill missing values with the column median.
#
# Arguments:
#   df: data frame to process.
#   numerical_columns: character vector naming the columns to impute.
# Returns:
#   `df` with every NA in the listed columns replaced by that column's median
#   (computed with na.rm = TRUE). Columns without NAs are left untouched.
#   One line is printed per column that was imputed.
impute_median <- function(df, numerical_columns) {
  for (col_name in numerical_columns) {
    values <- df[[col_name]]
    missing_mask <- is.na(values)

    # Nothing to fill in this column
    if (!any(missing_mask)) {
      next
    }

    fill_value <- median(values, na.rm = TRUE)
    df[[col_name]][missing_mask] <- fill_value
    cat(
      "Imputed missing values in column '", col_name,
      "' with median: ", fill_value, "\n",
      sep = ""
    )
  }

  df
}


# Median-impute every column listed in numerical_columns
df_new_1 <- impute_median(
  df = df_new,
  numerical_columns = numerical_columns
)
## Imputed missing values in column 'Ageyears' with median: 17
## Imputed missing values in column 'Height_cm' with median: 165.1
## Imputed missing values in column 'Footlength_cm' with median: 24
## Imputed missing values in column 'Armspan_cm' with median: 163
## Imputed missing values in column 'Languages_spoken' with median: 1
## Imputed missing values in column 'Travel_time_to_School' with median: 10
## Imputed missing values in column 'Reaction_time' with median: 0.3995
## Imputed missing values in column 'Score_in_memory_game' with median: 42
## Imputed missing values in column 'Importance_reducing_pollution' with median: 743
## Imputed missing values in column 'Importance_recycling_rubbish' with median: 700
## Imputed missing values in column 'Importance_conserving_water' with median: 645
## Imputed missing values in column 'Importance_saving_energy' with median: 700
## Imputed missing values in column 'Importance_owning_computer' with median: 513
## Imputed missing values in column 'Importance_Internet_access' with median: 800
## Imputed missing values in column 'Left_Footlength_cm' with median: 24
## Imputed missing values in column 'Index_Fingerlength_mm' with median: 70.5
## Imputed missing values in column 'Ring_Fingerlength_mm' with median: 70.75
## Imputed missing values in column 'Sleep_Hours_Schoolnight' with median: 7
## Imputed missing values in column 'Sleep_Hours_Non_Schoolnight' with median: 9
## Imputed missing values in column 'Home_Occupants' with median: 4
## Imputed missing values in column 'Text_Messages_Sent_Yesterday' with median: 30
## Imputed missing values in column 'Text_Messages_Received_Yesterday' with median: 35
## Imputed missing values in column 'Hanging_Out_With_Friends_Hours' with median: 9.5
## Imputed missing values in column 'Talking_On_Phone_Hours' with median: 1.5
## Imputed missing values in column 'Doing_Homework_Hours' with median: 6
## Imputed missing values in column 'Doing_Things_With_Family_Hours' with median: 5
## Imputed missing values in column 'Outdoor_Activities_Hours' with median: 5
## Imputed missing values in column 'Video_Games_Hours' with median: 1
## Imputed missing values in column 'Social_Websites_Hours' with median: 5
## Imputed missing values in column 'Texting_Messaging_Hours' with median: 4
## Imputed missing values in column 'Computer_Use_Hours' with median: 10
## Imputed missing values in column 'Watching_TV_Hours' with median: 4
## Imputed missing values in column 'Paid_Work_Hours' with median: 0
## Imputed missing values in column 'Work_At_Home_Hours' with median: 3
# Re-run the column check on the imputed data.
# NOTE(review): check_categorical_in_numerical() is defined outside this
# section; judging by its printed report it states, per column, whether the
# values are numeric and how many NAs remain.
check_categorical_in_numerical(df_new_1, numerical_columns)
## Column 'ClassGrade':
##   - Contains only numeric values.
##   - Number of missing values (NA): 0 
## 
## Column 'Ageyears':
##   - Contains only numeric values.
##   - Number of missing values (NA): 0 
## 
## Column 'Height_cm':
##   - Contains only numeric values.
##   - Number of missing values (NA): 0 
## 
## Column 'Footlength_cm':
##   - Contains only numeric values.
##   - Number of missing values (NA): 0 
## 
## Column 'Armspan_cm':
##   - Contains only numeric values.
##   - Number of missing values (NA): 0 
## 
## Column 'Languages_spoken':
##   - Contains only numeric values.
##   - Number of missing values (NA): 0 
## 
## Column 'Travel_time_to_School':
##   - Contains only numeric values.
##   - Number of missing values (NA): 0 
## 
## Column 'Reaction_time':
##   - Contains only numeric values.
##   - Number of missing values (NA): 0 
## 
## Column 'Score_in_memory_game':
##   - Contains only numeric values.
##   - Number of missing values (NA): 0 
## 
## Column 'Importance_reducing_pollution':
##   - Contains only numeric values.
##   - Number of missing values (NA): 0 
## 
## Column 'Importance_recycling_rubbish':
##   - Contains only numeric values.
##   - Number of missing values (NA): 0 
## 
## Column 'Importance_conserving_water':
##   - Contains only numeric values.
##   - Number of missing values (NA): 0 
## 
## Column 'Importance_saving_energy':
##   - Contains only numeric values.
##   - Number of missing values (NA): 0 
## 
## Column 'Importance_owning_computer':
##   - Contains only numeric values.
##   - Number of missing values (NA): 0 
## 
## Column 'Importance_Internet_access':
##   - Contains only numeric values.
##   - Number of missing values (NA): 0 
## 
## Column 'Left_Footlength_cm':
##   - Contains only numeric values.
##   - Number of missing values (NA): 0 
## 
## Column 'Index_Fingerlength_mm':
##   - Contains only numeric values.
##   - Number of missing values (NA): 0 
## 
## Column 'Ring_Fingerlength_mm':
##   - Contains only numeric values.
##   - Number of missing values (NA): 0 
## 
## Column 'Sleep_Hours_Schoolnight':
##   - Contains only numeric values.
##   - Number of missing values (NA): 0 
## 
## Column 'Sleep_Hours_Non_Schoolnight':
##   - Contains only numeric values.
##   - Number of missing values (NA): 0 
## 
## Column 'Home_Occupants':
##   - Contains only numeric values.
##   - Number of missing values (NA): 0 
## 
## Column 'Text_Messages_Sent_Yesterday':
##   - Contains only numeric values.
##   - Number of missing values (NA): 0 
## 
## Column 'Text_Messages_Received_Yesterday':
##   - Contains only numeric values.
##   - Number of missing values (NA): 0 
## 
## Column 'Hanging_Out_With_Friends_Hours':
##   - Contains only numeric values.
##   - Number of missing values (NA): 0 
## 
## Column 'Talking_On_Phone_Hours':
##   - Contains only numeric values.
##   - Number of missing values (NA): 0 
## 
## Column 'Doing_Homework_Hours':
##   - Contains only numeric values.
##   - Number of missing values (NA): 0 
## 
## Column 'Doing_Things_With_Family_Hours':
##   - Contains only numeric values.
##   - Number of missing values (NA): 0 
## 
## Column 'Outdoor_Activities_Hours':
##   - Contains only numeric values.
##   - Number of missing values (NA): 0 
## 
## Column 'Video_Games_Hours':
##   - Contains only numeric values.
##   - Number of missing values (NA): 0 
## 
## Column 'Social_Websites_Hours':
##   - Contains only numeric values.
##   - Number of missing values (NA): 0 
## 
## Column 'Texting_Messaging_Hours':
##   - Contains only numeric values.
##   - Number of missing values (NA): 0 
## 
## Column 'Computer_Use_Hours':
##   - Contains only numeric values.
##   - Number of missing values (NA): 0 
## 
## Column 'Watching_TV_Hours':
##   - Contains only numeric values.
##   - Number of missing values (NA): 0 
## 
## Column 'Paid_Work_Hours':
##   - Contains only numeric values.
##   - Number of missing values (NA): 0 
## 
## Column 'Work_At_Home_Hours':
##   - Contains only numeric values.
##   - Number of missing values (NA): 0
# Plot missingness again, now on the imputed data frame.
# NOTE(review): gg_miss_var() and vis_miss() are not defined in this section --
# presumably from naniar/visdat; confirm against the library() calls.
gg_miss_var(df_new_1)

vis_miss(df_new_1)

  • The only remaining missing values are in the categorical variables.

Handling Outliers in numerical data

# Detect and handle outliers in numerical columns.
#
# A value is flagged as an outlier when it lies outside the IQR fences
# [Q1 - iqr_multiplier * IQR, Q3 + iqr_multiplier * IQR] OR when its absolute
# z-score exceeds z_threshold. Missing values, and values whose z-score is
# undefined (e.g. a zero-variance column), are never flagged.
#
# Arguments:
#   df: Data frame containing the data
#   numerical_columns: Vector of column names (strings) to process
#   method: "remove" (default), "impute_mean", "impute_median", or "cap"
#     - "remove": drop the rows flagged for each column in turn, so later
#       columns are evaluated on the already-reduced data
#     - "impute_mean" / "impute_median": overwrite flagged values with the
#       column mean / median (computed before replacement, na.rm = TRUE)
#     - "cap": clamp values to the IQR fences; the z-score rule is not used
#   iqr_multiplier: Multiplier for IQR method (default = 1.5)
#   z_threshold: Threshold for Z-score method (default = 3)
# Returns:
#   The processed data frame. One status line is printed per column.
handle_outliers <- function(df, numerical_columns, method = "remove", iqr_multiplier = 1.5, z_threshold = 3) {
  # Validate once, up front, so a bad method fails even when there are no
  # columns to process.
  valid_methods <- c("remove", "impute_mean", "impute_median", "cap")
  if (!is.character(method) || length(method) != 1L ||
      !method %in% valid_methods) {
    stop(
      "Invalid method. Choose 'remove', 'impute_mean', 'impute_median', or 'cap'.",
      call. = FALSE
    )
  }

  for (col in numerical_columns) {
    data <- df[[col]]

    # IQR fences (local names chosen so stats::IQR is not shadowed)
    q1 <- quantile(data, 0.25, na.rm = TRUE, names = FALSE)
    q3 <- quantile(data, 0.75, na.rm = TRUE, names = FALSE)
    iqr_width <- q3 - q1
    lower_bound <- q1 - iqr_multiplier * iqr_width
    upper_bound <- q3 + iqr_multiplier * iqr_width

    # Z-scores as a plain vector; NaN/NA when the column has zero variance
    z_scores <- (data - mean(data, na.rm = TRUE)) / sd(data, na.rm = TRUE)

    # NA-safe masks: an NA comparison counts as "not an outlier". Without
    # this, NA entries in the mask turn into all-NA rows under "remove".
    below <- !is.na(data) & data < lower_bound
    above <- !is.na(data) & data > upper_bound
    z_flag <- !is.na(z_scores) & abs(z_scores) > z_threshold
    outliers <- below | above | z_flag

    if (method == "remove") {
      # drop = FALSE keeps a one-column data frame a data frame
      df <- df[!outliers, , drop = FALSE]
      cat("Outliers removed from column:", col, "\n")
    } else if (method == "impute_mean") {
      mean_value <- mean(data, na.rm = TRUE)
      df[[col]][outliers] <- mean_value
      cat("Outliers in column", col, "imputed with mean:", mean_value, "\n")
    } else if (method == "impute_median") {
      median_value <- median(data, na.rm = TRUE)
      df[[col]][outliers] <- median_value
      cat("Outliers in column", col, "imputed with median:", median_value, "\n")
    } else {
      # method == "cap": clamp to the IQR fences only
      df[[col]][below] <- lower_bound
      df[[col]][above] <- upper_bound
      cat("Outliers in column", col, "capped at bounds:", lower_bound, "and", upper_bound, "\n")
    }
  }

  df
}
# Cap every column listed in numerical_columns at its IQR fences; the log below
# prints the fences used per column. Where the lower fence is negative,
# non-negative data can only be changed by the upper cap.
df_capped <- handle_outliers(df_new_1, numerical_columns, method = "cap")
## Outliers in column ClassGrade capped at bounds: 7 and 15 
## Outliers in column Ageyears capped at bounds: 12 and 20 
## Outliers in column Height_cm capped at bounds: 127.575 and 203.775 
## Outliers in column Footlength_cm capped at bounds: 17.75 and 31.75 
## Outliers in column Armspan_cm capped at bounds: 117.5 and 209.5 
## Outliers in column Languages_spoken capped at bounds: -0.5 and 3.5 
## Outliers in column Travel_time_to_School capped at bounds: -12.5 and 39.5 
## Outliers in column Reaction_time capped at bounds: 0.08175 and 0.76775 
## Outliers in column Score_in_memory_game capped at bounds: 16.5 and 68.5 
## Outliers in column Importance_reducing_pollution capped at bounds: -175 and 1625 
## Outliers in column Importance_recycling_rubbish capped at bounds: -100 and 1500 
## Outliers in column Importance_conserving_water capped at bounds: -350 and 1650 
## Outliers in column Importance_saving_energy capped at bounds: -166.875 and 1540.125 
## Outliers in column Importance_owning_computer capped at bounds: -850 and 1950 
## Outliers in column Importance_Internet_access capped at bounds: -250 and 1750 
## Outliers in column Left_Footlength_cm capped at bounds: 17.625 and 31.025 
## Outliers in column Index_Fingerlength_mm capped at bounds: -22.5 and 141.5 
## Outliers in column Ring_Fingerlength_mm capped at bounds: -32.5 and 147.5 
## Outliers in column Sleep_Hours_Schoolnight capped at bounds: 3 and 11 
## Outliers in column Sleep_Hours_Non_Schoolnight capped at bounds: 5 and 13 
## Outliers in column Home_Occupants capped at bounds: 2.5 and 6.5 
## Outliers in column Text_Messages_Sent_Yesterday capped at bounds: -109.75 and 206.25 
## Outliers in column Text_Messages_Received_Yesterday capped at bounds: -120 and 232 
## Outliers in column Hanging_Out_With_Friends_Hours capped at bounds: -12.875 and 32.125 
## Outliers in column Talking_On_Phone_Hours capped at bounds: -3.5 and 8.5 
## Outliers in column Doing_Homework_Hours capped at bounds: -10 and 22 
## Outliers in column Doing_Things_With_Family_Hours capped at bounds: -7.5 and 20.5 
## Outliers in column Outdoor_Activities_Hours capped at bounds: -10 and 22 
## Outliers in column Video_Games_Hours capped at bounds: -9 and 15 
## Outliers in column Social_Websites_Hours capped at bounds: -11.5 and 24.5 
## Outliers in column Texting_Messaging_Hours capped at bounds: -13 and 27 
## Outliers in column Computer_Use_Hours capped at bounds: -22.375 and 50.625 
## Outliers in column Watching_TV_Hours capped at bounds: -7 and 17 
## Outliers in column Paid_Work_Hours capped at bounds: -12 and 20 
## Outliers in column Work_At_Home_Hours capped at bounds: -5 and 11
# Inspect column types and leading values after capping.
str(df_capped)
## 'data.frame':    452 obs. of  60 variables:
##  $ Country                         : chr  "USA" "USA" "USA" "USA" ...
##  $ Region                          : chr  "GA" "NE" "SC" "TX" ...
##  $ DataYear                        : num  2022 2017 2020 2023 2014 ...
##  $ ClassGrade                      : num  12 10 12 12 12 8 11 12 10 12 ...
##  $ Gender                          : chr  "Male" "Male" "Female" "Female" ...
##  $ Ageyears                        : num  17 16 18 17 17 12 16 17 15 16 ...
##  $ Handed                          : chr  "Right-Handed" "Right-Handed" "Right-Handed" "Right-Handed" ...
##  $ Height_cm                       : num  182 128 165 172 165 ...
##  $ Footlength_cm                   : num  31.8 31.8 24 26 25 ...
##  $ Armspan_cm                      : num  193 170 163 177 168 ...
##  $ Languages_spoken                : num  1 2 1 1 1 1 2 1 1.5 2 ...
##  $ Travel_to_School                : chr  "Car" "Car" "Car" "Car" ...
##  $ Travel_time_to_School           : num  10 30 10 7 10 15 10 7 10 7 ...
##  $ Reaction_time                   : num  0.324 0.34 0.768 0.328 0.501 ...
##  $ Score_in_memory_game            : num  37 50 39 35 40 61 32 33 30 30 ...
##  $ Favourite_physical_activity     : chr  "Lacrosse" "Baseball/Softball" "Athletics" "Other" ...
##  $ Importance_reducing_pollution   : num  743 900 299 650 800 572 900 450 200 791 ...
##  $ Importance_recycling_rubbish    : num  700 900 1000 600 900 583 500 450 100 503 ...
##  $ Importance_conserving_water     : num  645 900 326 450 850 620 300 450 400 834 ...
##  $ Importance_saving_energy        : num  700 800 487 500 800 695 300 450 750 856 ...
##  $ Importance_owning_computer      : num  513 500 0 200 800 754 800 1000 900 1000 ...
##  $ Importance_Internet_access      : num  800 600 1000 500 800 856 800 1000 900 1000 ...
##  $ Left_Footlength_cm              : num  30 31 24 26 25 24.2 24 22 26 23 ...
##  $ Longer_foot                     : chr  "Right foot" "Right foot" "Same length" "Right foot" ...
##  $ Index_Fingerlength_mm           : num  100 40 70.5 7 75 67 77 80 82 70 ...
##  $ Ring_Fingerlength_mm            : num  105 50 70.8 7 75 ...
##  $ Longer_Finger_Lefthand          : chr  "Ring finger" "Ring finger" "Ring finger" "Index finger" ...
##  $ Birth_month                     : chr  "January" "September" "September" "February" ...
##  $ Favorite_Season                 : chr  "Winter" "Summer" "Spring" "Spring" ...
##  $ Allergies                       : chr  "No" "Yes" "No" "Yes" ...
##  $ Vegetarian                      : chr  "No" "No" "No" "No" ...
##  $ Favorite_Food                   : chr  "Meat" "Meat" "Seafood" "Poultry" ...
##  $ Beverage                        : chr  "Water" "Water" "Water" "Water" ...
##  $ Favorite_School_Subject         : chr  "Mathematics and statistics" "Physical education" "Mathematics and statistics" "Art" ...
##  $ Sleep_Hours_Schoolnight         : num  8 7 7 7 8 7 6 6.5 6 6 ...
##  $ Sleep_Hours_Non_Schoolnight     : num  6 9 9 9 11 9 8 9 11 6 ...
##  $ Home_Occupants                  : num  3 5 4 6 6.5 4 4 6 2.5 4 ...
##  $ Home_Internet_Access            : chr  "Yes - other" "Yes - dial-up connection" "Yes - other" "Yes - broadband connection" ...
##  $ Communication_With_Friends      : chr  "Cell phone" "Myspace, Facebook, other social networking sites, or blog" "Text messaging" NA ...
##  $ Text_Messages_Sent_Yesterday    : num  206 20 4 15 7 ...
##  $ Text_Messages_Received_Yesterday: num  232 30 2 15 15 0 232 5 25 11 ...
##  $ Hanging_Out_With_Friends_Hours  : num  10 32.1 9.5 10 2 ...
##  $ Talking_On_Phone_Hours          : num  2 2 1.5 1 1 2 8.5 0 1 2 ...
##  $ Doing_Homework_Hours            : num  12 2 6 0 9 20 10 6 4 10 ...
##  $ Doing_Things_With_Family_Hours  : num  2 20.5 5 1 7 20.5 0 10 20 20.5 ...
##  $ Outdoor_Activities_Hours        : num  18 14 5 0 11 22 0 0 20 12 ...
##  $ Video_Games_Hours               : num  3 2 1 0 0 15 15 14 15 15 ...
##  $ Social_Websites_Hours           : num  1 6 5 3 20 0 16 7 15 4 ...
##  $ Texting_Messaging_Hours         : num  4 1 4 20 1 1 24 2 1 3 ...
##  $ Computer_Use_Hours              : num  10 3 10 50 21 25 30 21 25 39 ...
##  $ Watching_TV_Hours               : num  4 17 4 5 11 7 5 14 5 4 ...
##  $ Paid_Work_Hours                 : num  15 0 0 15 15 0 4 0 0 0 ...
##  $ Work_At_Home_Hours              : num  0.5 4 3 7 2 0 2 6 4 0 ...
##  $ Schoolwork_Pressure             : chr  "Very little" "Some" "A lot" "Some" ...
##  $ Planned_Education_Level         : chr  "Graduate degree" "Graduate degree" "Graduate degree" "Other" ...
##  $ Favorite_Music                  : chr  "Country" "Pop" "Rap/Hip hop" "Rap/Hip hop" ...
##  $ Superpower                      : chr  "Invisibility" "Super strength" "Telepathy" "Telepathy" ...
##  $ Preferred_Status                : chr  "Happy" "Happy" "Happy" "Happy" ...
##  $ Role_Model_Type                 : chr  "Business person" "Relative" "Relative" "Friend" ...
##  $ Charity_Donation                : chr  "Religious" "Health" "International aid" "International aid" ...

Categorical data handling

# Columns treated as categorical in the missing-value check and mode imputation.
# Note: ClassGrade is stored as numeric in the data but is handled as a
# category here.
categorical_variables <- c(
  "Country", "Region", "ClassGrade", "Gender", "Handed",
  "Travel_to_School", "Favourite_physical_activity",
  "Longer_foot", "Longer_Finger_Lefthand",
  "Birth_month", "Favorite_Season",
  "Allergies", "Vegetarian", "Favorite_Food", "Beverage",
  "Favorite_School_Subject",
  "Home_Internet_Access", "Communication_With_Friends",
  "Favorite_Music", "Superpower", "Preferred_Status",
  "Role_Model_Type", "Charity_Donation",
  "Planned_Education_Level", "Schoolwork_Pressure"
)
check_cat_missing_values <- function(df, categorical_variables) {
  # Count missing values in each of the given columns.
  # Arguments:
  #   df: Data frame containing the data
  #   categorical_variables: Vector of column names (strings) to check
  # Returns:
  #   Named integer vector: one NA count per column, named by column.

  # df[cols] always yields a data frame (df[, cols] collapses to a plain vector
  # when a single column is requested), and vapply() guarantees an integer
  # result regardless of how many columns are passed.
  missing_summary <- vapply(
    df[categorical_variables],
    function(col) sum(is.na(col)),
    integer(1)
  )
  return(missing_summary)
}

# Count the NAs in every categorical column before imputation and show them
missing_values_cat <- check_cat_missing_values(
  df = df_capped,
  categorical_variables = categorical_variables
)
print(missing_values_cat)
##                     Country                      Region 
##                           0                           0 
##                  ClassGrade                      Gender 
##                           0                           1 
##                      Handed            Travel_to_School 
##                           1                           1 
## Favourite_physical_activity                 Longer_foot 
##                           1                          16 
##      Longer_Finger_Lefthand                 Birth_month 
##                          19                           3 
##             Favorite_Season                   Allergies 
##                           2                           5 
##                  Vegetarian               Favorite_Food 
##                           7                           3 
##                    Beverage     Favorite_School_Subject 
##                           4                           5 
##        Home_Internet_Access  Communication_With_Friends 
##                          12                          19 
##              Favorite_Music                  Superpower 
##                          39                          35 
##            Preferred_Status             Role_Model_Type 
##                          34                          36 
##            Charity_Donation     Planned_Education_Level 
##                          34                          34 
##         Schoolwork_Pressure 
##                          31
calculate_mode <- function(col) {
  # Return the most frequent non-missing value of `col` as a character label.
  # Arguments:
  #   col: Vector (or factor) of values; NAs are ignored by table()
  # Returns:
  #   A single character string, or NA_character_ when `col` has no
  #   non-missing values.
  counts <- table(col)

  # An empty or all-NA input produces an empty table with no names to pick from
  if (length(counts) == 0) {
    return(NA_character_)
  }

  most_frequent <- names(sort(counts, decreasing = TRUE))[1]
  return(most_frequent)
}


impute_missing_values_cat <- function(df, categorical_variables) {
  # Replace missing values in each listed column with that column's mode.
  # Arguments:
  #   df: Data frame containing the data
  #   categorical_variables: Vector of column names (strings) to process
  # Returns:
  #   The data frame with NAs in the listed columns replaced by the mode.
  #   Column types are preserved; columns with no observed value are left as is.
  # Side effects:
  #   Prints progress lines for every column via cat().
  for (col in categorical_variables) {
    column <- df[[col]]
    cat("Processing column:", col, "\n")
    cat("Class of column:", class(column), "\n")

    missing_idx <- is.na(column)
    if (!any(missing_idx)) {
      next
    }

    mode_label <- calculate_mode(column)
    if (length(mode_label) != 1 || is.na(mode_label)) {
      # Nothing observed in this column, so there is no mode to impute with
      cat("Skipped column '", col, "': no observed values to compute a mode from\n", sep = "")
      next
    }

    if (is.factor(column) || is.character(column)) {
      fill_value <- as.character(mode_label)
    } else {
      # The mode comes back as a character label; look up the matching observed
      # value so that e.g. a numeric column is not coerced to character.
      observed <- column[!missing_idx]
      fill_value <- observed[match(as.character(mode_label), as.character(observed))]
      if (is.na(fill_value)) {
        fill_value <- mode_label
      }
    }

    df[[col]][missing_idx] <- fill_value
    cat("Imputed missing values in column '", col, "' with mode: '", mode_label, "'\n", sep = "")
  }
  return(df)
}


# Replace NAs in each categorical column with that column's most frequent value.
df_imputed <- impute_missing_values_cat(df_capped, categorical_variables)
## Processing column: Country 
## Class of column: character 
## Processing column: Region 
## Class of column: character 
## Processing column: ClassGrade 
## Class of column: numeric 
## Processing column: Gender 
## Class of column: character 
## Imputed missing values in column 'Gender' with mode: 'Female'
## Processing column: Handed 
## Class of column: character 
## Imputed missing values in column 'Handed' with mode: 'Right-Handed'
## Processing column: Travel_to_School 
## Class of column: character 
## Imputed missing values in column 'Travel_to_School' with mode: 'Car'
## Processing column: Favourite_physical_activity 
## Class of column: character 
## Imputed missing values in column 'Favourite_physical_activity' with mode: 'Other'
## Processing column: Longer_foot 
## Class of column: character 
## Imputed missing values in column 'Longer_foot' with mode: 'Same length'
## Processing column: Longer_Finger_Lefthand 
## Class of column: character 
## Imputed missing values in column 'Longer_Finger_Lefthand' with mode: 'Ring finger'
## Processing column: Birth_month 
## Class of column: character 
## Imputed missing values in column 'Birth_month' with mode: 'September'
## Processing column: Favorite_Season 
## Class of column: character 
## Imputed missing values in column 'Favorite_Season' with mode: 'Summer'
## Processing column: Allergies 
## Class of column: character 
## Imputed missing values in column 'Allergies' with mode: 'No'
## Processing column: Vegetarian 
## Class of column: character 
## Imputed missing values in column 'Vegetarian' with mode: 'No'
## Processing column: Favorite_Food 
## Class of column: character 
## Imputed missing values in column 'Favorite_Food' with mode: 'Meat'
## Processing column: Beverage 
## Class of column: character 
## Imputed missing values in column 'Beverage' with mode: 'Water'
## Processing column: Favorite_School_Subject 
## Class of column: character 
## Imputed missing values in column 'Favorite_School_Subject' with mode: 'Mathematics and statistics'
## Processing column: Home_Internet_Access 
## Class of column: character 
## Imputed missing values in column 'Home_Internet_Access' with mode: 'Yes - broadband connection'
## Processing column: Communication_With_Friends 
## Class of column: character 
## Imputed missing values in column 'Communication_With_Friends' with mode: 'Text messaging'
## Processing column: Favorite_Music 
## Class of column: character 
## Imputed missing values in column 'Favorite_Music' with mode: 'Rap/Hip hop'
## Processing column: Superpower 
## Class of column: character 
## Imputed missing values in column 'Superpower' with mode: 'Telepathy'
## Processing column: Preferred_Status 
## Class of column: character 
## Imputed missing values in column 'Preferred_Status' with mode: 'Happy'
## Processing column: Role_Model_Type 
## Class of column: character 
## Imputed missing values in column 'Role_Model_Type' with mode: 'Relative'
## Processing column: Charity_Donation 
## Class of column: character 
## Imputed missing values in column 'Charity_Donation' with mode: 'Health'
## Processing column: Planned_Education_Level 
## Class of column: character 
## Imputed missing values in column 'Planned_Education_Level' with mode: 'Graduate degree'
## Processing column: Schoolwork_Pressure 
## Class of column: character 
## Imputed missing values in column 'Schoolwork_Pressure' with mode: 'Some'
# NOTE(review): this runs the imputation a second time. The log below contains
# no "Imputed ..." lines, i.e. no categorical NAs remain, but the value stored
# here is a data frame, not a table of counts. If counts are wanted,
# check_cat_missing_values(df_imputed, categorical_variables) returns them.
missing_values_after_imputation <- impute_missing_values_cat(df_imputed, categorical_variables)
## Processing column: Country 
## Class of column: character 
## Processing column: Region 
## Class of column: character 
## Processing column: ClassGrade 
## Class of column: numeric 
## Processing column: Gender 
## Class of column: character 
## Processing column: Handed 
## Class of column: character 
## Processing column: Travel_to_School 
## Class of column: character 
## Processing column: Favourite_physical_activity 
## Class of column: character 
## Processing column: Longer_foot 
## Class of column: character 
## Processing column: Longer_Finger_Lefthand 
## Class of column: character 
## Processing column: Birth_month 
## Class of column: character 
## Processing column: Favorite_Season 
## Class of column: character 
## Processing column: Allergies 
## Class of column: character 
## Processing column: Vegetarian 
## Class of column: character 
## Processing column: Favorite_Food 
## Class of column: character 
## Processing column: Beverage 
## Class of column: character 
## Processing column: Favorite_School_Subject 
## Class of column: character 
## Processing column: Home_Internet_Access 
## Class of column: character 
## Processing column: Communication_With_Friends 
## Class of column: character 
## Processing column: Favorite_Music 
## Class of column: character 
## Processing column: Superpower 
## Class of column: character 
## Processing column: Preferred_Status 
## Class of column: character 
## Processing column: Role_Model_Type 
## Class of column: character 
## Processing column: Charity_Donation 
## Class of column: character 
## Processing column: Planned_Education_Level 
## Class of column: character 
## Processing column: Schoolwork_Pressure 
## Class of column: character
# Plot the amount of missing data per variable after imputation
# (gg_miss_var is presumably provided by naniar; confirm in the setup).
gg_miss_var(df_imputed)

Looking at variables of interest

 # Categorical variables in to factors

# Convert the categorical columns used in the analysis to factors in one pass.
# Each column is listed exactly once.
factor_columns <- c(
  "Region",
  "Gender",
  "Handed",
  "Schoolwork_Pressure",
  "Travel_to_School",
  "Beverage",
  "Home_Internet_Access",
  "Favorite_School_Subject"
)
df_imputed[factor_columns] <- lapply(df_imputed[factor_columns], as.factor)
 # Categorical variables of interest

interest_cat <- c(
  "Region", "Gender", "Handed",
  "Travel_to_School", "Favourite_physical_activity",
  "Beverage", "Favorite_School_Subject",
  "Home_Internet_Access", "Schoolwork_Pressure"
)


# Numerical variables of interest
interest_num <- c(
  "Ageyears",
  "Travel_time_to_School",
  "Reaction_time",
  "Score_in_memory_game",
  "Sleep_Hours_Schoolnight",
  "Sleep_Hours_Non_Schoolnight",
  "Home_Occupants",
  "Text_Messages_Sent_Yesterday",
  "Text_Messages_Received_Yesterday",
  "Hanging_Out_With_Friends_Hours",
  "Talking_On_Phone_Hours",
  "Doing_Homework_Hours",
  "Doing_Things_With_Family_Hours",
  "Outdoor_Activities_Hours",
  "Video_Games_Hours",
  "Social_Websites_Hours",
  "Texting_Messaging_Hours",
  "Computer_Use_Hours",
  "Watching_TV_Hours",
  "Paid_Work_Hours",
  "Work_At_Home_Hours"
)

# Keep only those columns
num_vars <- df_imputed[, interest_num]

# Pairwise correlations, computed on the rows with no NA in any selected column
cor_matrix <- cor(x = num_vars, use = "complete.obs")

# Visualize correlation matrix

corrplot(cor_matrix, 
         method = "circle",  # Use circles to represent correlation
         tl.cex = 0.8,       # Text size
         tl.col = "darkorchid")     # Text label color (dark orchid)

Derived variables

# Unweighted mean of the school-night and non-school-night sleep hours
df_imputed[["Avg_Night_Sleep"]] <- rowMeans(
  df_imputed[c("Sleep_Hours_Schoolnight", "Sleep_Hours_Non_Schoolnight")]
)

Pressure ?

# Collapse the pressure scale to two groups: "None" and "Very little" count as
# low pressure, every other value as high pressure. as.factor() orders the
# levels alphabetically, so "High Pressure" becomes the first level.
df_imputed$Schoolwork_Pressure_Binary <- as.factor(
  ifelse(
    df_imputed$Schoolwork_Pressure %in% c("None", "Very little"),
    "Low Pressure",
    "High Pressure"
  )
)

Technology ?

# Technology importance: mean of the two technology-related importance ratings
df_imputed[["Importance_of_Technology_score"]] <- rowMeans(
  df_imputed[c("Importance_Internet_access", "Importance_owning_computer")]
)

# Screen time: sum of the five screen-related hour columns divided by 7
# (presumably weekly totals turned into a daily mean; confirm the units)
df_imputed[["Mean_daily_screen_time"]] <- rowSums(
  df_imputed[c(
    "Video_Games_Hours", "Computer_Use_Hours",
    "Social_Websites_Hours", "Watching_TV_Hours",
    "Texting_Messaging_Hours"
  )]
) / 7

Physical and Social Behaviours

# Physical and social activity: sum of four activity hour columns divided by 7
# (presumably weekly totals turned into a daily mean; confirm the units)
df_imputed[["Mean_daily_Physical_and_Social_Activity"]] <- rowSums(
  df_imputed[c(
    "Outdoor_Activities_Hours",
    "Hanging_Out_With_Friends_Hours",
    "Work_At_Home_Hours",
    "Doing_Things_With_Family_Hours"
  )]
) / 7

Visualizing categorical variables

# Draw a bar chart of counts for one categorical column.
# Arguments:
#   data:    Data frame holding the column
#   column:  Column name (string) mapped to the x axis
#   title:   Plot title (default: "Bar Plot of <column>")
#   x_label: x-axis label (default: the column name)
# Returns:
#   A ggplot object; it is drawn when printed.
plot_category_counts <- function(data, column,
                                 title = paste("Bar Plot of", column),
                                 x_label = column) {
  ggplot(data, aes(x = .data[[column]])) +
    geom_bar(fill = "darkseagreen", color = "black") +
    theme_minimal() +
    labs(title = title, x = x_label, y = "Count") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
}

# One bar chart per categorical variable; print() is required inside a loop
# because ggplot objects are only auto-printed at top level.
bar_plot_columns <- c(
  "Region",
  "Gender",
  "Handed",
  "Travel_to_School",
  "Favourite_physical_activity",
  "Beverage",
  "Favorite_School_Subject",
  "Home_Internet_Access"
)
for (bar_plot_column in bar_plot_columns) {
  print(plot_category_counts(df_imputed, bar_plot_column))
}

# Order the pressure levels from least to most pressure so the bars follow the
# scale instead of alphabetical order.
df_imputed$Schoolwork_Pressure <- factor(df_imputed$Schoolwork_Pressure, 
                                          levels = c("None", "Very little", "Some", "A lot"))

plot_category_counts(
  df_imputed,
  "Schoolwork_Pressure",
  title = "Bar Plot of School Work Pressure"
)

plot_category_counts(
  df_imputed,
  "Schoolwork_Pressure_Binary",
  title = "Bar Plot of School work_Pressure",
  x_label = "Schoolwork_Pressure"
)

Visualizing numerical variables

Paid Work Hours

# Density estimate of Paid_Work_Hours
ggplot(data = df_imputed, mapping = aes(x = Paid_Work_Hours)) +
  geom_density(alpha = 0.4, fill = "blue") +
  labs(
    title = "Density Plot of Paid_Work_Hours",
    x = "Paid_Work_Hours",
    y = "Density"
  ) +
  theme_minimal()

Reaction Time

# Density estimate of Reaction_time
ggplot(data = df_imputed, mapping = aes(x = Reaction_time)) +
  geom_density(alpha = 0.4, fill = "blue") +
  labs(
    title = "Density Plot of Reaction Time",
    x = "Reaction Time",
    y = "Density"
  ) +
  theme_minimal()

Score in memory game

# Density estimate of Score_in_memory_game
ggplot(data = df_imputed, mapping = aes(x = Score_in_memory_game)) +
  geom_density(alpha = 0.4, fill = "coral4") +
  labs(
    title = "Density Plot of Score_in_memory_game",
    x = "Score_in_memory_game",
    y = "Density"
  ) +
  theme_minimal()

Mean daily screen time

# Density estimate of the derived Mean_daily_screen_time
ggplot(data = df_imputed, mapping = aes(x = Mean_daily_screen_time)) +
  geom_density(alpha = 0.4, fill = "royalblue4") +
  labs(
    title = "Density Plot of Mean_daily_screen_time",
    x = "Mean_daily_screen_time",
    y = "Density"
  ) +
  theme_minimal()

Average night sleep

# Density estimate of the derived Avg_Night_Sleep
ggplot(data = df_imputed, mapping = aes(x = Avg_Night_Sleep)) +
  geom_density(alpha = 0.4, fill = "limegreen") +
  labs(
    title = "Density Plot of Avg_Night_Sleep",
    x = "Avg_Night_Sleep",
    y = "Density"
  ) +
  theme_minimal()

Density Plot of Physical_and_Social_Activity

# Density estimate of the derived Mean_daily_Physical_and_Social_Activity
ggplot(
  data = df_imputed,
  mapping = aes(x = Mean_daily_Physical_and_Social_Activity)
) +
  geom_density(alpha = 0.4, fill = "darkkhaki") +
  labs(
    title = "Density Plot of Mean_daily_Physical_and_Social_Activity",
    x = "Mean_daily_Physical_and_Social_Activity",
    y = "Density"
  ) +
  theme_minimal()

Visualizing numerical variables against categorical variables

# One boxplot per numeric measure, split by the binary pressure group.
# The first vector lists the response columns, the second the matching fill
# colours; Map() pairs them up in order.
invisible(Map(
  function(response, fill) {
    boxplot(
      reformulate("Schoolwork_Pressure_Binary", response = response),
      data = df_imputed,
      main = paste("Boxplot of", response, "by Schoolwork_Pressure"),
      xlab = "Schoolwork_PressureColumn",
      ylab = response,
      col = fill
    )
  },
  c(
    "Reaction_time",
    "Score_in_memory_game",
    "Mean_daily_screen_time",
    "Mean_daily_Physical_and_Social_Activity"
  ),
  c("maroon", "lightblue", "limegreen", "powderblue")
))

Analysing

# Modelling subset: the binary pressure response plus four candidate predictors.
df_selected <- df_imputed %>%
  select(Schoolwork_Pressure_Binary, Mean_daily_screen_time, Mean_daily_Physical_and_Social_Activity
         , 
         Home_Occupants, Avg_Night_Sleep)

# Logistic regression of the response on all other columns of df_selected.
# For a factor response, glm() treats the first level as "failure". The levels
# of Schoolwork_Pressure_Binary are in alphabetical order ("High Pressure",
# "Low Pressure"), so the coefficients describe the log-odds of "Low Pressure".
full_model <- glm(Schoolwork_Pressure_Binary ~ ., data = df_selected , family = binomial)
summary(full_model)
## 
## Call:
## glm(formula = Schoolwork_Pressure_Binary ~ ., family = binomial, 
##     data = df_selected)
## 
## Coefficients:
##                                          Estimate Std. Error z value Pr(>|z|)
## (Intercept)                             -2.578662   0.950302  -2.714  0.00666
## Mean_daily_screen_time                  -0.060654   0.033830  -1.793  0.07298
## Mean_daily_Physical_and_Social_Activity  0.148522   0.051388   2.890  0.00385
## Home_Occupants                           0.141829   0.115378   1.229  0.21898
## Avg_Night_Sleep                          0.003684   0.096529   0.038  0.96956
##                                           
## (Intercept)                             **
## Mean_daily_screen_time                  . 
## Mean_daily_Physical_and_Social_Activity **
## Home_Occupants                            
## Avg_Night_Sleep                           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 409.45  on 451  degrees of freedom
## Residual deviance: 398.19  on 447  degrees of freedom
## AIC: 408.19
## 
## Number of Fisher Scoring iterations: 4
# Stepwise selection by AIC, starting from the full model; direction = "both"
# lets terms be dropped and added back at each step (see the trace below).
stepwise_model <- step(full_model, direction = "both")
## Start:  AIC=408.19
## Schoolwork_Pressure_Binary ~ Mean_daily_screen_time + Mean_daily_Physical_and_Social_Activity + 
##     Home_Occupants + Avg_Night_Sleep
## 
##                                           Df Deviance    AIC
## - Avg_Night_Sleep                          1   398.20 406.20
## - Home_Occupants                           1   399.70 407.70
## <none>                                         398.19 408.19
## - Mean_daily_screen_time                   1   401.62 409.62
## - Mean_daily_Physical_and_Social_Activity  1   406.41 414.41
## 
## Step:  AIC=406.2
## Schoolwork_Pressure_Binary ~ Mean_daily_screen_time + Mean_daily_Physical_and_Social_Activity + 
##     Home_Occupants
## 
##                                           Df Deviance    AIC
## - Home_Occupants                           1   399.70 405.70
## <none>                                         398.20 406.20
## - Mean_daily_screen_time                   1   401.64 407.64
## + Avg_Night_Sleep                          1   398.19 408.19
## - Mean_daily_Physical_and_Social_Activity  1   406.41 412.41
## 
## Step:  AIC=405.7
## Schoolwork_Pressure_Binary ~ Mean_daily_screen_time + Mean_daily_Physical_and_Social_Activity
## 
##                                           Df Deviance    AIC
## <none>                                         399.70 405.70
## + Home_Occupants                           1   398.20 406.20
## - Mean_daily_screen_time                   1   403.13 407.13
## + Avg_Night_Sleep                          1   399.70 407.70
## - Mean_daily_Physical_and_Social_Activity  1   408.67 412.67
# Coefficient table of the model retained by the stepwise search.
summary(stepwise_model)
## 
## Call:
## glm(formula = Schoolwork_Pressure_Binary ~ Mean_daily_screen_time + 
##     Mean_daily_Physical_and_Social_Activity, family = binomial, 
##     data = df_selected)
## 
## Coefficients:
##                                         Estimate Std. Error z value Pr(>|z|)
## (Intercept)                             -1.95455    0.28353  -6.894 5.44e-12
## Mean_daily_screen_time                  -0.06049    0.03371  -1.794  0.07273
## Mean_daily_Physical_and_Social_Activity  0.15424    0.05108   3.020  0.00253
##                                            
## (Intercept)                             ***
## Mean_daily_screen_time                  .  
## Mean_daily_Physical_and_Social_Activity ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 409.45  on 451  degrees of freedom
## Residual deviance: 399.70  on 449  degrees of freedom
## AIC: 405.7
## 
## Number of Fisher Scoring iterations: 4
# Save the modelling subset and the full cleaned data set as CSV files
# (row names are not written).
write.csv(x = df_selected, file = "df_selected.csv", row.names = FALSE)
write.csv(x = df_imputed, file = "df_imputed.csv", row.names = FALSE)
# Diagram of the candidate predictors pointing at the response.

# Nodes: one central node for the response and one node per predictor.
# x/y give fixed layout coordinates (centre at the origin, predictors on the
# four corners); size is the vertex size passed to plot().
nodes <- data.frame(
  id = c("center", "screen_time", "activity", "home_occupants", "sleep"),
  label = c("Perception on\nSchool Work Pressure", 
            "Mean Daily\nScreen Time", 
            "Mean Daily\nPhysical &\nSocial Activity",
            "Home \nOccupants", 
            "Average \nNight Sleep"),
  x = c(0, -1, 1, -1, 1),
  y = c(0, 1, 1, -1, -1),
  size = c(80, 80, 80, 80, 80)
)

# Edges: every predictor node points to the central node.
edges <- data.frame(
  from = c("screen_time", "activity", "home_occupants", "sleep"),
  to = rep("center", 4),
  arrow.size = 0.5
)

# Create the graph
g <- graph_from_data_frame(edges, vertices = nodes, directed = TRUE)

# Plot the graph. vertex.color supplies exactly one colour per vertex: the
# first (central) node in green, all remaining nodes in salmon.
plot(g, 
     layout = as.matrix(nodes[, c("x", "y")]),
     vertex.label = nodes$label,
     vertex.size = nodes$size,
     vertex.color = c("darkseagreen3", rep("lightsalmon", nrow(nodes) - 1)),
     vertex.frame.color = "gray",
     vertex.label.color = "black",
     vertex.label.cex = 0.8,
     edge.arrow.size = 0.3,
     edge.arrow.width = 1,
     edge.color = "red4",
     main = "Factors that could influenced Perception of School Work Pressure")