df <- read.xlsx("data.xlsx")
str(df)
## 'data.frame': 452 obs. of 60 variables:
## $ Country : chr "USA" "USA" "USA" "USA" ...
## $ Region : chr "GA" "NE" "SC" "TX" ...
## $ DataYear : num 2022 2017 2020 2023 2014 ...
## $ ClassGrade : num 12 10 12 12 12 8 11 12 10 12 ...
## $ Gender : chr "Male" "Male" "Female" "Female" ...
## $ Ageyears : num 17 16 18 17 17 11 16 17 15 16 ...
## $ Handed : chr "Right-Handed" "Right-Handed" "Right-Handed" "Right-Handed" ...
## $ Height_cm : chr "182" "70" "5'3" "172" ...
## $ Footlength_cm : chr "33" "32" NA "26" ...
## $ Armspan_cm : chr "193" "170" NA "177" ...
## $ Languages_spoken : num 1 2 1 1 1 1 2 1 1.5 2 ...
## $ Travel_to_School : chr "Car" "Car" "Car" "Car" ...
## $ Travel_time_to_School : chr "10" "30" NA "7" ...
## $ Reaction_time : num 0.324 0.34 3.061 0.328 0.501 ...
## $ Score_in_memory_game : chr "37" "50" "39" "35" ...
## $ Favourite_physical_activity : chr "Lacrosse" "Baseball/Softball" "Athletics" "Other" ...
## $ Importance_reducing_pollution : num NA 900 299 650 800 572 900 450 200 791 ...
## $ Importance_recycling_rubbish : num NA 900 1000 600 900 583 500 450 100 503 ...
## $ Importance_conserving_water : chr NA "900" "326" "450" ...
## $ Importance_saving_energy : num NA 800 487 500 800 695 300 450 750 856 ...
## $ Importance_owning_computer : num NA 500 0 200 800 754 800 1000 900 1000 ...
## $ Importance_Internet_access : num NA 600 1000 500 800 856 800 1000 900 1000 ...
## $ Left_Footlength_cm : chr "30" "31" NA "26" ...
## $ Longer_foot : chr "Right foot" "Right foot" "Same length" "Right foot" ...
## $ Index_Fingerlength_mm : chr "100" "40" NA "7" ...
## $ Ring_Fingerlength_mm : chr "105" "50" NA "7" ...
## $ Longer_Finger_Lefthand : chr "Ring finger" "Ring finger" "Ring finger" "Index finger" ...
## $ Birth_month : chr "January" "September" "September" "February" ...
## $ Favorite_Season : chr "Winter" "Summer" "Spring" "Spring" ...
## $ Allergies : chr "No" "Yes" "No" "Yes" ...
## $ Vegetarian : chr "No" "No" "No" "No" ...
## $ Favorite_Food : chr "Meat" "Meat" "Seafood" "Poultry" ...
## $ Beverage : chr "Water" "Water" "Water" "Water" ...
## $ Favorite_School_Subject : chr "Mathematics and statistics" "Physical education" "Mathematics and statistics" "Art" ...
## $ Sleep_Hours_Schoolnight : chr "8" "7" NA "7" ...
## $ Sleep_Hours_Non_Schoolnight : chr "6" "9" NA "9" ...
## $ Home_Occupants : num 3 5 4 6 8 4 4 6 2 4 ...
## $ Home_Internet_Access : chr "Yes - other" "Yes - dial-up connection" "Yes - other" "Yes - broadband connection" ...
## $ Communication_With_Friends : chr "Cell phone" "Myspace, Facebook, other social networking sites, or blog" "Text messaging" NA ...
## $ Text_Messages_Sent_Yesterday : chr "260" "20" "4" "15" ...
## $ Text_Messages_Received_Yesterday: chr "274" "30" "2" "15" ...
## $ Hanging_Out_With_Friends_Hours : chr "10" "48" NA "10" ...
## $ Talking_On_Phone_Hours : chr "2" "2" NA "1" ...
## $ Doing_Homework_Hours : chr "12" "2" NA "0" ...
## $ Doing_Things_With_Family_Hours : chr "2" "48" NA "1" ...
## $ Outdoor_Activities_Hours : chr "18" "14" NA "0" ...
## $ Video_Games_Hours : chr "3" "2" NA "0" ...
## $ Social_Websites_Hours : chr "1" "6" NA "3" ...
## $ Texting_Messaging_Hours : chr "4" "1" NA "20" ...
## $ Computer_Use_Hours : chr "10" "3" NA "50" ...
## $ Watching_TV_Hours : chr "4" "20" NA "5" ...
## $ Paid_Work_Hours : chr "15" "0" NA "15" ...
## $ Work_At_Home_Hours : chr "0.5" "4" NA "7" ...
## $ Schoolwork_Pressure : chr "Very little" "Some" "A lot" "Some" ...
## $ Planned_Education_Level : chr "Graduate degree" "Graduate degree" "Graduate degree" "Other" ...
## $ Favorite_Music : chr "Country" "Pop" "Rap/Hip hop" "Rap/Hip hop" ...
## $ Superpower : chr "Invisibility" "Super strength" "Telepathy" "Telepathy" ...
## $ Preferred_Status : chr "Happy" "Happy" "Happy" "Happy" ...
## $ Role_Model_Type : chr "Business person" "Relative" "Relative" "Friend" ...
## $ Charity_Donation : chr "Religious" "Health" "International aid" "International aid" ...
summary(df)
## Country Region DataYear ClassGrade
## Length:452 Length:452 Min. :2010 Min. : 4.00
## Class :character Class :character 1st Qu.:2016 1st Qu.:10.00
## Mode :character Mode :character Median :2018 Median :12.00
## Mean :2018 Mean :10.54
## 3rd Qu.:2021 3rd Qu.:12.00
## Max. :2024 Max. :12.00
##
## Gender Ageyears Handed Height_cm
## Length:452 Min. :10.0 Length:452 Length:452
## Class :character 1st Qu.:15.0 Class :character Class :character
## Mode :character Median :17.0 Mode :character Mode :character
## Mean :16.1
## 3rd Qu.:17.0
## Max. :63.0
## NA's :1
## Footlength_cm Armspan_cm Languages_spoken Travel_to_School
## Length:452 Length:452 Min. :1.000 Length:452
## Class :character Class :character 1st Qu.:1.000 Class :character
## Mode :character Mode :character Median :1.000 Mode :character
## Mean :1.511
## 3rd Qu.:2.000
## Max. :7.000
## NA's :5
## Travel_time_to_School Reaction_time Score_in_memory_game
## Length:452 Min. : 0.071 Length:452
## Class :character 1st Qu.: 0.338 Class :character
## Mode :character Median : 0.400 Mode :character
## Mean : 77.743
## 3rd Qu.: 0.520
## Max. :31402.000
## NA's :16
## Favourite_physical_activity Importance_reducing_pollution
## Length:452 Min. : 0.0
## Class :character 1st Qu.: 500.0
## Mode :character Median : 743.0
## Mean : 707.9
## 3rd Qu.: 979.5
## Max. :10000.0
## NA's :21
## Importance_recycling_rubbish Importance_conserving_water
## Min. : 0.0 Length:452
## 1st Qu.: 487.0 Class :character
## Median : 700.0 Mode :character
## Mean : 719.6
## 3rd Qu.: 900.0
## Max. :10000.0
## NA's :21
## Importance_saving_energy Importance_owning_computer Importance_Internet_access
## Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 403.0 1st Qu.: 200.0 1st Qu.: 500.0
## Median : 700.0 Median : 513.0 Median : 800.0
## Mean : 678.9 Mean : 557.4 Mean : 778.3
## 3rd Qu.: 900.0 3rd Qu.: 900.0 3rd Qu.: 1000.0
## Max. :10000.0 Max. :7000.0 Max. :10000.0
## NA's :46 NA's :21 NA's :21
## Left_Footlength_cm Longer_foot Index_Fingerlength_mm
## Length:452 Length:452 Length:452
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Ring_Fingerlength_mm Longer_Finger_Lefthand Birth_month
## Length:452 Length:452 Length:452
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Favorite_Season Allergies Vegetarian Favorite_Food
## Length:452 Length:452 Length:452 Length:452
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Beverage Favorite_School_Subject Sleep_Hours_Schoolnight
## Length:452 Length:452 Length:452
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Sleep_Hours_Non_Schoolnight Home_Occupants Home_Internet_Access
## Length:452 Min. : 1.0 Length:452
## Class :character 1st Qu.: 3.0 Class :character
## Mode :character Median : 4.0 Mode :character
## Mean : 108.1
## 3rd Qu.: 5.0
## Max. :45752.0
## NA's :11
## Communication_With_Friends Text_Messages_Sent_Yesterday
## Length:452 Length:452
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
## Text_Messages_Received_Yesterday Hanging_Out_With_Friends_Hours
## Length:452 Length:452
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
## Talking_On_Phone_Hours Doing_Homework_Hours Doing_Things_With_Family_Hours
## Length:452 Length:452 Length:452
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Outdoor_Activities_Hours Video_Games_Hours Social_Websites_Hours
## Length:452 Length:452 Length:452
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Texting_Messaging_Hours Computer_Use_Hours Watching_TV_Hours
## Length:452 Length:452 Length:452
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Paid_Work_Hours Work_At_Home_Hours Schoolwork_Pressure
## Length:452 Length:452 Length:452
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Planned_Education_Level Favorite_Music Superpower
## Length:452 Length:452 Length:452
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Preferred_Status Role_Model_Type Charity_Donation
## Length:452 Length:452 Length:452
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
num_vars <- sum(sapply(df, is.numeric))
cat_vars <- sum(sapply(df, is.factor)) +
sum(sapply(df, is.character))
cat("Numerical variables:", num_vars, "\n")
## Numerical variables: 11
cat("Categorical variables:", cat_vars, "\n")
## Categorical variables: 49
options(repr.plot.width = 20, repr.plot.height = 30)
gg_miss_var(df)
missing_percent <- colMeans(is.na(df)) * 100
sort(missing_percent)
## Country Region
## 0.0000000 0.0000000
## DataYear ClassGrade
## 0.0000000 0.0000000
## Gender Ageyears
## 0.2212389 0.2212389
## Handed Travel_to_School
## 0.2212389 0.2212389
## Favourite_physical_activity Favorite_Season
## 0.2212389 0.4424779
## Birth_month Favorite_Food
## 0.6637168 0.6637168
## Beverage Languages_spoken
## 0.8849558 1.1061947
## Travel_time_to_School Allergies
## 1.1061947 1.1061947
## Favorite_School_Subject Vegetarian
## 1.1061947 1.5486726
## Sleep_Hours_Schoolnight Sleep_Hours_Non_Schoolnight
## 1.9911504 1.9911504
## Home_Occupants Home_Internet_Access
## 2.4336283 2.6548673
## Height_cm Text_Messages_Sent_Yesterday
## 2.8761062 3.0973451
## Text_Messages_Received_Yesterday Reaction_time
## 3.0973451 3.5398230
## Longer_foot Left_Footlength_cm
## 3.5398230 4.2035398
## Longer_Finger_Lefthand Communication_With_Friends
## 4.2035398 4.2035398
## Footlength_cm Importance_reducing_pollution
## 4.4247788 4.6460177
## Importance_recycling_rubbish Importance_conserving_water
## 4.6460177 4.6460177
## Importance_owning_computer Importance_Internet_access
## 4.6460177 4.6460177
## Score_in_memory_game Hanging_Out_With_Friends_Hours
## 4.8672566 4.8672566
## Talking_On_Phone_Hours Doing_Homework_Hours
## 5.9734513 5.9734513
## Doing_Things_With_Family_Hours Outdoor_Activities_Hours
## 6.6371681 6.6371681
## Schoolwork_Pressure Ring_Fingerlength_mm
## 6.8584071 7.0796460
## Social_Websites_Hours Planned_Education_Level
## 7.5221239 7.5221239
## Preferred_Status Charity_Donation
## 7.5221239 7.5221239
## Video_Games_Hours Superpower
## 7.7433628 7.7433628
## Work_At_Home_Hours Role_Model_Type
## 7.9646018 7.9646018
## Index_Fingerlength_mm Texting_Messaging_Hours
## 8.1858407 8.1858407
## Computer_Use_Hours Watching_TV_Hours
## 8.4070796 8.6283186
## Favorite_Music Armspan_cm
## 8.6283186 8.8495575
## Paid_Work_Hours Importance_saving_energy
## 8.8495575 10.1769912
colnames(df)
## [1] "Country" "Region"
## [3] "DataYear" "ClassGrade"
## [5] "Gender" "Ageyears"
## [7] "Handed" "Height_cm"
## [9] "Footlength_cm" "Armspan_cm"
## [11] "Languages_spoken" "Travel_to_School"
## [13] "Travel_time_to_School" "Reaction_time"
## [15] "Score_in_memory_game" "Favourite_physical_activity"
## [17] "Importance_reducing_pollution" "Importance_recycling_rubbish"
## [19] "Importance_conserving_water" "Importance_saving_energy"
## [21] "Importance_owning_computer" "Importance_Internet_access"
## [23] "Left_Footlength_cm" "Longer_foot"
## [25] "Index_Fingerlength_mm" "Ring_Fingerlength_mm"
## [27] "Longer_Finger_Lefthand" "Birth_month"
## [29] "Favorite_Season" "Allergies"
## [31] "Vegetarian" "Favorite_Food"
## [33] "Beverage" "Favorite_School_Subject"
## [35] "Sleep_Hours_Schoolnight" "Sleep_Hours_Non_Schoolnight"
## [37] "Home_Occupants" "Home_Internet_Access"
## [39] "Communication_With_Friends" "Text_Messages_Sent_Yesterday"
## [41] "Text_Messages_Received_Yesterday" "Hanging_Out_With_Friends_Hours"
## [43] "Talking_On_Phone_Hours" "Doing_Homework_Hours"
## [45] "Doing_Things_With_Family_Hours" "Outdoor_Activities_Hours"
## [47] "Video_Games_Hours" "Social_Websites_Hours"
## [49] "Texting_Messaging_Hours" "Computer_Use_Hours"
## [51] "Watching_TV_Hours" "Paid_Work_Hours"
## [53] "Work_At_Home_Hours" "Schoolwork_Pressure"
## [55] "Planned_Education_Level" "Favorite_Music"
## [57] "Superpower" "Preferred_Status"
## [59] "Role_Model_Type" "Charity_Donation"
# Numerical columns to check
numerical_columns <- c("ClassGrade", "Ageyears", "Height_cm" ,"Footlength_cm","Armspan_cm" ,"Languages_spoken","Travel_time_to_School" ,"Reaction_time" ,"Score_in_memory_game" ,"Importance_reducing_pollution" ,"Importance_recycling_rubbish" ,"Importance_conserving_water","Importance_saving_energy" ,"Importance_owning_computer" ,"Importance_Internet_access" , "Left_Footlength_cm" ,"Index_Fingerlength_mm","Ring_Fingerlength_mm" ,"Sleep_Hours_Schoolnight","Sleep_Hours_Non_Schoolnight","Home_Occupants","Text_Messages_Sent_Yesterday", "Text_Messages_Received_Yesterday","Hanging_Out_With_Friends_Hours" ,"Talking_On_Phone_Hours","Doing_Homework_Hours" ,"Doing_Things_With_Family_Hours","Outdoor_Activities_Hours","Video_Games_Hours","Social_Websites_Hours" , "Texting_Messaging_Hours" , "Computer_Use_Hours" , "Watching_TV_Hours","Paid_Work_Hours" ,"Work_At_Home_Hours")
check_categorical_in_numerical <- function(df, numerical_columns) {
for (col in numerical_columns) {
# Convert the column to character and get unique values
unique_values <- unique(as.character(df[[col]]))
# Identify non-numeric values
non_numeric_values <- unique_values[is.na(as.numeric(unique_values))]
# Count missing values (NA)
missing_values_count <- sum(is.na(df[[col]]))
# Print results
cat("Column '", col, "':\n", sep = "")
if (length(non_numeric_values) > 0) {
cat(" - Contains categorical values:\n")
print(non_numeric_values)
} else {
cat(" - Contains only numeric values.\n")
}
cat(" - Number of missing values (NA):", missing_values_count, "\n\n")
}
}
check_categorical_in_numerical(df, numerical_columns)
## Column 'ClassGrade':
## - Contains only numeric values.
## - Number of missing values (NA): 0
##
## Column 'Ageyears':
## - Contains categorical values:
## [1] NA
## - Number of missing values (NA): 1
## Column 'Height_cm':
## - Contains categorical values:
## [1] "5'3" "5'11" "4”10" "5'2" NA "4'9" "5'4" "5,8"
## [9] "4'10" "5’2”" "5'6" "177cm" "5'5" "5´3" "5'3\"" "5' 3\""
## [17] "158cm" "157cm" "172cm" "5' 11"
## - Number of missing values (NA): 13
## Column 'Footlength_cm':
## - Contains categorical values:
## [1] NA "29cm" "24cm" "20cm" "31cm" "12cm"
## - Number of missing values (NA): 20
## Column 'Armspan_cm':
## - Contains categorical values:
## [1] NA "56cm" "189cm" "160cm" "165cm" "71cm"
## - Number of missing values (NA): 40
##
## Column 'Languages_spoken':
## - Contains categorical values:
## [1] NA
## - Number of missing values (NA): 5
## Column 'Travel_time_to_School':
## - Contains categorical values:
## [1] NA "3 min" "7 ish" "21 m" "1hr" "5 min" "12min" "Oof" "14min"
## [10] "20-25" "1hour"
## - Number of missing values (NA): 5
##
## Column 'Reaction_time':
## - Contains categorical values:
## [1] NA
## - Number of missing values (NA): 16
## Column 'Score_in_memory_game':
## - Contains categorical values:
## [1] NA "52 s"
## - Number of missing values (NA): 22
##
## Column 'Importance_reducing_pollution':
## - Contains categorical values:
## [1] NA
## - Number of missing values (NA): 21
##
## Column 'Importance_recycling_rubbish':
## - Contains categorical values:
## [1] NA
## - Number of missing values (NA): 21
## Column 'Importance_conserving_water':
## - Contains categorical values:
## [1] NA "?"
## - Number of missing values (NA): 21
##
## Column 'Importance_saving_energy':
## - Contains categorical values:
## [1] NA
## - Number of missing values (NA): 46
##
## Column 'Importance_owning_computer':
## - Contains categorical values:
## [1] NA
## - Number of missing values (NA): 21
##
## Column 'Importance_Internet_access':
## - Contains categorical values:
## [1] NA
## - Number of missing values (NA): 21
## Column 'Left_Footlength_cm':
## - Contains categorical values:
## [1] NA "8cm" "12cm"
## - Number of missing values (NA): 19
## Column 'Index_Fingerlength_mm':
## - Contains categorical values:
## [1] NA "30mm" "idk" "8cm" "80mm" "70mm" "3'4"
## - Number of missing values (NA): 37
## Column 'Ring_Fingerlength_mm':
## - Contains categorical values:
## [1] NA "25mm" "idk" "7cm" "60 mm" "3'2"
## - Number of missing values (NA): 32
## Column 'Sleep_Hours_Schoolnight':
## - Contains categorical values:
## [1] NA "6hr" "6hour" "7 1/2"
## - Number of missing values (NA): 9
## Column 'Sleep_Hours_Non_Schoolnight':
## - Contains categorical values:
## [1] NA "6~7" "less" "yes" "8hour"
## - Number of missing values (NA): 9
##
## Column 'Home_Occupants':
## - Contains categorical values:
## [1] NA
## - Number of missing values (NA): 11
## Column 'Text_Messages_Sent_Yesterday':
## - Contains categorical values:
## [1] NA "a lot" "40ish" "A lot" "200`" "yes" "n/a" "alot" "30-40"
## [10] "50ish" "30-50" "Idk"
## - Number of missing values (NA): 14
## Column 'Text_Messages_Received_Yesterday':
## - Contains categorical values:
## [1] NA "5~6" "1000s" "idk" "35ish" "A lot" "Idk" "n/a" "alot"
## [10] "30-40" "40-60" "?"
## - Number of missing values (NA): 14
## Column 'Hanging_Out_With_Friends_Hours':
## - Contains categorical values:
## [1] NA "5~6hr" "4hr" "4hour" "2hrs" "Idk"
## - Number of missing values (NA): 22
## Column 'Talking_On_Phone_Hours':
## - Contains categorical values:
## [1] NA "2hr" "<1" "1hr" "5 %" "yes" "2hour" "8mins" "Idk"
## - Number of missing values (NA): 27
## Column 'Doing_Homework_Hours':
## - Contains categorical values:
## [1] NA "2 hr" "1hr" "24/7" "2hour" "2hr" "<1" "Idk"
## - Number of missing values (NA): 27
## Column 'Doing_Things_With_Family_Hours':
## - Contains categorical values:
## [1] NA "1 hr" "6hr" "24/7" "5hour" "6hrs" "Idk"
## - Number of missing values (NA): 30
## Column 'Outdoor_Activities_Hours':
## - Contains categorical values:
## [1] NA "1 hr" "30min" "1hour" "6hrs" "Idk" "P"
## - Number of missing values (NA): 30
## Column 'Video_Games_Hours':
## - Contains categorical values:
## [1] NA "1 hr" "15min" "yes" "10min" "7hrs" "Idk"
## - Number of missing values (NA): 35
## Column 'Social_Websites_Hours':
## - Contains categorical values:
## [1] NA "1 hr" "5hr" "24/7" "1hour" "14hrs" "<1" "Idk"
## - Number of missing values (NA): 34
## Column 'Texting_Messaging_Hours':
## - Contains categorical values:
## [1] NA "2 hr" "30min" "24/7" "15min" "20hrs" "Idk"
## - Number of missing values (NA): 37
## Column 'Computer_Use_Hours':
## - Contains categorical values:
## [1] NA "24/7" "1 hr" "4hr" "yes" ";" "1hour" "3hrs" "Idk"
## - Number of missing values (NA): 38
## Column 'Watching_TV_Hours':
## - Contains categorical values:
## [1] NA "24/7" "2 hr" "<1" "3hr" "yes" "30min" "1hr" "Idk"
## - Number of missing values (NA): 39
## Column 'Paid_Work_Hours':
## - Contains categorical values:
## [1] NA "8hr" "n/a" "7hour" "<1" "35-40" "Idk"
## - Number of missing values (NA): 40
## Column 'Work_At_Home_Hours':
## - Contains categorical values:
## [1] NA "1 hr" "30min" "n/a" "6hour" "20hrs" "<1" "Idk" "3 1/2"
## - Number of missing values (NA): 36
num_data <- df %>%
select(numerical_columns)
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(numerical_columns)
##
## # Now:
## data %>% select(all_of(numerical_columns))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
gg_miss_var(num_data)
vis_miss(num_data)
ggplot(df, aes(y = Ageyears)) +
geom_boxplot(fill = "lightblue") +
labs(title = "Boxplot of Ageyears", y = "Values")
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).
ggplot(df, aes(y = Height_cm)) +
geom_boxplot(fill = "darkseagreen") +
labs(title = "Boxplot of Height_cm", y = "Values")
unique(df$Height_cm)
## [1] "182" "70" "5'3"
## [4] "172" "165" "155"
## [7] "169" "160" "190"
## [10] "167" "5'11" "170.2"
## [13] "149" "167.6" "175"
## [16] "175.2" "154" "4”10"
## [19] "183" "4.1100000000000003" "181"
## [22] "152.4" "145" "180"
## [25] "5'2" "164" "162"
## [28] "157" "182.9" "176"
## [31] "163" "170" "178"
## [34] "158" "161" NA
## [37] "143" "154.9" "176.5"
## [40] "132" "182.8" "174"
## [43] "173" "152" "157.5"
## [46] "63" "4'9" "5'4"
## [49] "150.30000000000001" "62" "193"
## [52] "177.8" "5,8" "150"
## [55] "156" "159" "153"
## [58] "187.9" "175.3" "177"
## [61] "124" "4'10" "165.1"
## [64] "187" "147" "163.4"
## [67] "150.5" "181.6" "5"
## [70] "146" "168" "134"
## [73] "191.5" "146.5" "158.19999999999999"
## [76] "166" "184" "189.2"
## [79] "187.5" "68" "149.9"
## [82] "5’2”" "200" "171"
## [85] "5'6" "320" "129.5"
## [88] "13.3" "172.7" "192"
## [91] "144" "177cm" "74"
## [94] "45788" "5'5" "196"
## [97] "120" "6" "190.5"
## [100] "5´3" "172.5" "5'3\""
## [103] "154.30000000000001" "179" "180.3"
## [106] "67" "170.1" "157.4"
## [109] "169.5" "176.7" "5' 3\""
## [112] "158cm" "56" "157cm"
## [115] "160.19999999999999" "151" "5.7"
## [118] "138" "61" "148"
## [121] "170.5" "162.5" "5.3"
## [124] "164.5" "141" "64"
## [127] "142" "52" "134.9"
## [130] "139" "5.6" "22.86"
## [133] "172cm" "185" "188"
## [136] "156.19999999999999" "12" "65"
## [139] "5' 11" "69" "64.400000000000006"
_ Some variables have inconsistencies
# Define function to convert height format to numeric
convert_height <- function(height) {
height <- tolower(height) # Convert to lowercase for consistency
# Handle different formats
height <- str_replace_all(height, "cm", "") # Remove 'cm'
height <- str_replace_all(height, "[^0-9.]", " ") # Keep only numbers and periods
# Convert to numeric, suppress warnings
as.numeric(str_trim(height))
}
# Function to convert time responses to numeric
convert_time <- function(time) {
time <- tolower(time) # Convert to lowercase
time <- str_replace_all(time, "min", "") # Remove 'min'
time <- str_replace_all(time, "hour", "") # Remove 'hour'
time <- str_replace_all(time, "[^0-9.]", " ") # Keep only numbers
as.numeric(str_trim(time))
}
# Function to clean categorical responses in numeric columns
clean_numeric_column <- function(column) {
column <- tolower(column) # Convert to lowercase
column <- str_replace_all(column, "[^0-9.]", " ") # Remove non-numeric characters
column <- as.numeric(str_trim(column)) # Convert to numeric
return(column)
}
# Apply cleaning functions to specific columns
df_new <- df %>%
mutate(
Height_cm = convert_height(Height_cm),
Footlength_cm = convert_height(Footlength_cm),
Armspan_cm = convert_height(Armspan_cm),
Travel_time_to_School = convert_time(Travel_time_to_School),
Reaction_time = clean_numeric_column(Reaction_time),
Score_in_memory_game = clean_numeric_column(Score_in_memory_game),
Importance_reducing_pollution = clean_numeric_column(Importance_reducing_pollution),
Importance_recycling_rubbish = clean_numeric_column(Importance_recycling_rubbish),
Importance_conserving_water = clean_numeric_column(Importance_conserving_water),
Importance_saving_energy = clean_numeric_column(Importance_saving_energy),
Importance_owning_computer = clean_numeric_column(Importance_owning_computer),
Importance_Internet_access = clean_numeric_column(Importance_Internet_access),
Left_Footlength_cm = convert_height(Left_Footlength_cm),
Index_Fingerlength_mm = clean_numeric_column(Index_Fingerlength_mm),
Ring_Fingerlength_mm = clean_numeric_column(Ring_Fingerlength_mm),
Sleep_Hours_Schoolnight = clean_numeric_column(Sleep_Hours_Schoolnight),
Sleep_Hours_Non_Schoolnight = clean_numeric_column(Sleep_Hours_Non_Schoolnight),
Home_Occupants = clean_numeric_column(Home_Occupants),
Text_Messages_Sent_Yesterday = clean_numeric_column(Text_Messages_Sent_Yesterday),
Text_Messages_Received_Yesterday = clean_numeric_column(Text_Messages_Received_Yesterday),
Hanging_Out_With_Friends_Hours = clean_numeric_column(Hanging_Out_With_Friends_Hours),
Talking_On_Phone_Hours = clean_numeric_column(Talking_On_Phone_Hours),
Doing_Homework_Hours = clean_numeric_column(Doing_Homework_Hours),
Doing_Things_With_Family_Hours = clean_numeric_column(Doing_Things_With_Family_Hours),
Outdoor_Activities_Hours = clean_numeric_column(Outdoor_Activities_Hours),
Video_Games_Hours = clean_numeric_column(Video_Games_Hours),
Social_Websites_Hours = clean_numeric_column(Social_Websites_Hours),
Texting_Messaging_Hours = clean_numeric_column(Texting_Messaging_Hours),
Computer_Use_Hours = clean_numeric_column(Computer_Use_Hours),
Watching_TV_Hours = clean_numeric_column(Watching_TV_Hours),
Paid_Work_Hours = clean_numeric_column(Paid_Work_Hours),
Work_At_Home_Hours = clean_numeric_column(Work_At_Home_Hours)
)
# Print summary to check if the columns are now numeric
str(df_new)
## 'data.frame': 452 obs. of 60 variables:
## $ Country : chr "USA" "USA" "USA" "USA" ...
## $ Region : chr "GA" "NE" "SC" "TX" ...
## $ DataYear : num 2022 2017 2020 2023 2014 ...
## $ ClassGrade : num 12 10 12 12 12 8 11 12 10 12 ...
## $ Gender : chr "Male" "Male" "Female" "Female" ...
## $ Ageyears : num 17 16 18 17 17 11 16 17 15 16 ...
## $ Handed : chr "Right-Handed" "Right-Handed" "Right-Handed" "Right-Handed" ...
## $ Height_cm : num 182 70 NA 172 165 155 169 160 190 167 ...
## $ Footlength_cm : num 33 32 NA 26 25 25.6 24 22 30 23 ...
## $ Armspan_cm : num 193 170 NA 177 168 155 50 159 192 69 ...
## $ Languages_spoken : num 1 2 1 1 1 1 2 1 1.5 2 ...
## $ Travel_to_School : chr "Car" "Car" "Car" "Car" ...
## $ Travel_time_to_School : num 10 30 NA 7 10 15 10 7 10 7 ...
## $ Reaction_time : num 0.324 0.34 3.061 0.328 0.501 ...
## $ Score_in_memory_game : num 37 50 39 35 40 61 32 33 30 30 ...
## $ Favourite_physical_activity : chr "Lacrosse" "Baseball/Softball" "Athletics" "Other" ...
## $ Importance_reducing_pollution : num NA 900 299 650 800 572 900 450 200 791 ...
## $ Importance_recycling_rubbish : num NA 900 1000 600 900 583 500 450 100 503 ...
## $ Importance_conserving_water : num NA 900 326 450 850 620 300 450 400 834 ...
## $ Importance_saving_energy : num NA 800 487 500 800 695 300 450 750 856 ...
## $ Importance_owning_computer : num NA 500 0 200 800 754 800 1000 900 1000 ...
## $ Importance_Internet_access : num NA 600 1000 500 800 856 800 1000 900 1000 ...
## $ Left_Footlength_cm : num 30 31 NA 26 25 24.2 24 22 26 23 ...
## $ Longer_foot : chr "Right foot" "Right foot" "Same length" "Right foot" ...
## $ Index_Fingerlength_mm : num 100 40 NA 7 75 67 77 80 82 70 ...
## $ Ring_Fingerlength_mm : num 105 50 NA 7 75 67 69 78 80 80 ...
## $ Longer_Finger_Lefthand : chr "Ring finger" "Ring finger" "Ring finger" "Index finger" ...
## $ Birth_month : chr "January" "September" "September" "February" ...
## $ Favorite_Season : chr "Winter" "Summer" "Spring" "Spring" ...
## $ Allergies : chr "No" "Yes" "No" "Yes" ...
## $ Vegetarian : chr "No" "No" "No" "No" ...
## $ Favorite_Food : chr "Meat" "Meat" "Seafood" "Poultry" ...
## $ Beverage : chr "Water" "Water" "Water" "Water" ...
## $ Favorite_School_Subject : chr "Mathematics and statistics" "Physical education" "Mathematics and statistics" "Art" ...
## $ Sleep_Hours_Schoolnight : num 8 7 NA 7 8 7 6 6.5 6 6 ...
## $ Sleep_Hours_Non_Schoolnight : num 6 9 NA 9 11 9 8 9 11 6 ...
## $ Home_Occupants : num 3 5 4 6 8 4 4 6 2 4 ...
## $ Home_Internet_Access : chr "Yes - other" "Yes - dial-up connection" "Yes - other" "Yes - broadband connection" ...
## $ Communication_With_Friends : chr "Cell phone" "Myspace, Facebook, other social networking sites, or blog" "Text messaging" NA ...
## $ Text_Messages_Sent_Yesterday : num 260 20 4 15 7 0 1000 4 20 10 ...
## $ Text_Messages_Received_Yesterday: num 274 30 2 15 15 ...
## $ Hanging_Out_With_Friends_Hours : num 10 48 NA 10 2 10 20 2 10 35 ...
## $ Talking_On_Phone_Hours : num 2 2 NA 1 1 2 20 0 1 2 ...
## $ Doing_Homework_Hours : num 12 2 NA 0 9 20 10 6 4 10 ...
## $ Doing_Things_With_Family_Hours : num 2 48 NA 1 7 35 0 10 20 24 ...
## $ Outdoor_Activities_Hours : num 18 14 NA 0 11 27 0 0 20 12 ...
## $ Video_Games_Hours : num 3 2 NA 0 0 30 25 14 15 20 ...
## $ Social_Websites_Hours : num 1 6 NA 3 20 0 16 7 15 4 ...
## $ Texting_Messaging_Hours : num 4 1 NA 20 1 1 24 2 1 3 ...
## $ Computer_Use_Hours : num 10 3 NA 50 21 25 30 21 25 39 ...
## $ Watching_TV_Hours : num 4 20 NA 5 11 7 5 14 5 4 ...
## $ Paid_Work_Hours : num 15 0 NA 15 15 0 4 0 0 0 ...
## $ Work_At_Home_Hours : num 0.5 4 NA 7 2 0 2 6 4 0 ...
## $ Schoolwork_Pressure : chr "Very little" "Some" "A lot" "Some" ...
## $ Planned_Education_Level : chr "Graduate degree" "Graduate degree" "Graduate degree" "Other" ...
## $ Favorite_Music : chr "Country" "Pop" "Rap/Hip hop" "Rap/Hip hop" ...
## $ Superpower : chr "Invisibility" "Super strength" "Telepathy" "Telepathy" ...
## $ Preferred_Status : chr "Happy" "Happy" "Happy" "Happy" ...
## $ Role_Model_Type : chr "Business person" "Relative" "Relative" "Friend" ...
## $ Charity_Donation : chr "Religious" "Health" "International aid" "International aid" ...
summary(df_new)
## Country Region DataYear ClassGrade
## Length:452 Length:452 Min. :2010 Min. : 4.00
## Class :character Class :character 1st Qu.:2016 1st Qu.:10.00
## Mode :character Mode :character Median :2018 Median :12.00
## Mean :2018 Mean :10.54
## 3rd Qu.:2021 3rd Qu.:12.00
## Max. :2024 Max. :12.00
##
## Gender Ageyears Handed Height_cm
## Length:452 Min. :10.0 Length:452 Min. : 4.11
## Class :character 1st Qu.:15.0 Class :character 1st Qu.: 155.25
## Mode :character Median :17.0 Mode :character Median : 165.10
## Mean :16.1 Mean : 267.83
## 3rd Qu.:17.0 3rd Qu.: 176.00
## Max. :63.0 Max. :45788.00
## NA's :1 NA's :30
## Footlength_cm Armspan_cm Languages_spoken Travel_to_School
## Min. : 0.25 Min. : 1.2 Min. :1.000 Length:452
## 1st Qu.: 22.77 1st Qu.:150.0 1st Qu.:1.000 Class :character
## Median : 24.00 Median :163.0 Median :1.000 Mode :character
## Mean : 27.22 Mean :152.0 Mean :1.511
## 3rd Qu.: 26.70 3rd Qu.:175.3 3rd Qu.:2.000
## Max. :274.00 Max. :416.2 Max. :7.000
## NA's :20 NA's :40 NA's :5
## Travel_time_to_School Reaction_time Score_in_memory_game
## Min. : 0.0 Min. : 0.071 Min. : 0.359
## 1st Qu.: 7.0 1st Qu.: 0.338 1st Qu.: 36.000
## Median : 10.0 Median : 0.400 Median : 42.000
## Mean : 220.6 Mean : 77.743 Mean : 43.743
## 3rd Qu.: 20.0 3rd Qu.: 0.520 3rd Qu.: 49.750
## Max. :45787.0 Max. :31402.000 Max. :121.000
## NA's :7 NA's :16 NA's :22
## Favourite_physical_activity Importance_reducing_pollution
## Length:452 Min. : 0.0
## Class :character 1st Qu.: 500.0
## Mode :character Median : 743.0
## Mean : 707.9
## 3rd Qu.: 979.5
## Max. :10000.0
## NA's :21
## Importance_recycling_rubbish Importance_conserving_water
## Min. : 0.0 Min. : 0.0
## 1st Qu.: 487.0 1st Qu.: 400.0
## Median : 700.0 Median : 645.0
## Mean : 719.6 Mean : 636.5
## 3rd Qu.: 900.0 3rd Qu.: 900.0
## Max. :10000.0 Max. :10000.0
## NA's :21 NA's :22
## Importance_saving_energy Importance_owning_computer Importance_Internet_access
## Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 403.0 1st Qu.: 200.0 1st Qu.: 500.0
## Median : 700.0 Median : 513.0 Median : 800.0
## Mean : 678.9 Mean : 557.4 Mean : 778.3
## 3rd Qu.: 900.0 3rd Qu.: 900.0 3rd Qu.: 1000.0
## Max. :10000.0 Max. :7000.0 Max. :10000.0
## NA's :46 NA's :21 NA's :21
## Left_Footlength_cm Longer_foot Index_Fingerlength_mm
## Min. : 0.24 Length:452 Min. : 0.1016
## 1st Qu.: 22.50 Class :character 1st Qu.: 18.8000
## Median : 24.00 Mode :character Median : 70.5000
## Mean : 26.93 Mean : 72.1799
## 3rd Qu.: 26.00 3rd Qu.: 80.0000
## Max. :335.00 Max. :1500.0000
## NA's :19 NA's :39
## Ring_Fingerlength_mm Longer_Finger_Lefthand Birth_month
## Min. : 0.1016 Length:452 Length:452
## 1st Qu.: 20.2750 Class :character Class :character
## Median : 70.7500 Mode :character Mode :character
## Mean : 72.7494
## 3rd Qu.: 80.0000
## Max. :1700.0000
## NA's :34
## Favorite_Season Allergies Vegetarian Favorite_Food
## Length:452 Length:452 Length:452 Length:452
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Beverage Favorite_School_Subject Sleep_Hours_Schoolnight
## Length:452 Length:452 Min. : 2
## Class :character Class :character 1st Qu.: 6
## Mode :character Mode :character Median : 7
## Mean : 1043
## 3rd Qu.: 8
## Max. :45847
## NA's :10
## Sleep_Hours_Non_Schoolnight Home_Occupants Home_Internet_Access
## Min. : 1 Min. : 1.0 Length:452
## 1st Qu.: 8 1st Qu.: 3.0 Class :character
## Median : 9 Median : 4.0 Mode :character
## Mean : 1156 Mean : 108.1
## 3rd Qu.: 10 3rd Qu.: 5.0
## Max. :46004 Max. :45752.0
## NA's :12 NA's :11
## Communication_With_Friends Text_Messages_Sent_Yesterday
## Length:452 Min. : 0.00
## Class :character 1st Qu.: 7.25
## Mode :character Median : 30.00
## Mean : 83.74
## 3rd Qu.: 100.00
## Max. :2000.00
## NA's :22
## Text_Messages_Received_Yesterday Hanging_Out_With_Friends_Hours
## Min. : 0.00 Min. : 0.0
## 1st Qu.: 10.75 1st Qu.: 4.0
## Median : 35.00 Median : 9.5
## Mean : 133.50 Mean : 228.2
## 3rd Qu.: 108.25 3rd Qu.: 18.0
## Max. :10000.00 Max. :45846.0
## NA's :24 NA's :24
## Talking_On_Phone_Hours Doing_Homework_Hours Doing_Things_With_Family_Hours
## Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 1.0 1st Qu.: 2.0 1st Qu.: 3.0
## Median : 1.5 Median : 6.0 Median : 5.0
## Mean : 113.6 Mean : 117.4 Mean : 228.4
## 3rd Qu.: 4.0 3rd Qu.: 10.0 3rd Qu.: 12.0
## Max. :45878.0 Max. :45721.0 Max. :45910.0
## NA's :29 NA's :29 NA's :32
## Outdoor_Activities_Hours Video_Games_Hours Social_Websites_Hours
## Min. : 0.00 Min. : 0.000 Min. : 0.00
## 1st Qu.: 2.00 1st Qu.: 0.000 1st Qu.: 2.00
## Median : 5.00 Median : 1.000 Median : 5.00
## Mean : 226.05 Mean : 5.087 Mean : 11.71
## 3rd Qu.: 10.12 3rd Qu.: 6.500 3rd Qu.: 12.00
## Max. :45879.00 Max. :126.000 Max. :168.00
## NA's :32 NA's :37 NA's :36
## Texting_Messaging_Hours Computer_Use_Hours Watching_TV_Hours Paid_Work_Hours
## Min. : 0.0 Min. : 0 Min. : 0.0 Min. : 0.000
## 1st Qu.: 2.0 1st Qu.: 4 1st Qu.: 1.0 1st Qu.: 0.000
## Median : 4.0 Median : 10 Median : 4.0 Median : 0.000
## Mean : 148.3 Mean : 158 Mean : 230.7 Mean : 6.513
## 3rd Qu.: 14.0 3rd Qu.: 25 3rd Qu.: 9.0 3rd Qu.:10.000
## Max. :45720.0 Max. :45692 Max. :45945.0 Max. :60.000
## NA's :39 NA's :44 NA's :42 NA's :43
## Work_At_Home_Hours Schoolwork_Pressure Planned_Education_Level
## Min. : 0.000 Length:452 Length:452
## 1st Qu.: 1.000 Class :character Class :character
## Median : 3.000 Mode :character Mode :character
## Mean : 5.261
## 3rd Qu.: 5.000
## Max. :76.000
## NA's :39
## Favorite_Music Superpower Preferred_Status Role_Model_Type
## Length:452 Length:452 Length:452 Length:452
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Charity_Donation
## Length:452
## Class :character
## Mode :character
##
##
##
##
vis_miss(df_new)
impute_median <- function(df, numerical_columns) {
for (col in numerical_columns) {
if (any(is.na(df[[col]]))) { # Check if the column has missing values
median_value <- median(df[[col]], na.rm = TRUE) # Calculate median
df[[col]][is.na(df[[col]])] <- median_value # Replace missing values with median
cat("Imputed missing values in column '", col, "' with median: ", median_value, "\n", sep = "")
}
}
return(df)
}
df_new_1<- impute_median(df_new,numerical_columns )
## Imputed missing values in column 'Ageyears' with median: 17
## Imputed missing values in column 'Height_cm' with median: 165.1
## Imputed missing values in column 'Footlength_cm' with median: 24
## Imputed missing values in column 'Armspan_cm' with median: 163
## Imputed missing values in column 'Languages_spoken' with median: 1
## Imputed missing values in column 'Travel_time_to_School' with median: 10
## Imputed missing values in column 'Reaction_time' with median: 0.3995
## Imputed missing values in column 'Score_in_memory_game' with median: 42
## Imputed missing values in column 'Importance_reducing_pollution' with median: 743
## Imputed missing values in column 'Importance_recycling_rubbish' with median: 700
## Imputed missing values in column 'Importance_conserving_water' with median: 645
## Imputed missing values in column 'Importance_saving_energy' with median: 700
## Imputed missing values in column 'Importance_owning_computer' with median: 513
## Imputed missing values in column 'Importance_Internet_access' with median: 800
## Imputed missing values in column 'Left_Footlength_cm' with median: 24
## Imputed missing values in column 'Index_Fingerlength_mm' with median: 70.5
## Imputed missing values in column 'Ring_Fingerlength_mm' with median: 70.75
## Imputed missing values in column 'Sleep_Hours_Schoolnight' with median: 7
## Imputed missing values in column 'Sleep_Hours_Non_Schoolnight' with median: 9
## Imputed missing values in column 'Home_Occupants' with median: 4
## Imputed missing values in column 'Text_Messages_Sent_Yesterday' with median: 30
## Imputed missing values in column 'Text_Messages_Received_Yesterday' with median: 35
## Imputed missing values in column 'Hanging_Out_With_Friends_Hours' with median: 9.5
## Imputed missing values in column 'Talking_On_Phone_Hours' with median: 1.5
## Imputed missing values in column 'Doing_Homework_Hours' with median: 6
## Imputed missing values in column 'Doing_Things_With_Family_Hours' with median: 5
## Imputed missing values in column 'Outdoor_Activities_Hours' with median: 5
## Imputed missing values in column 'Video_Games_Hours' with median: 1
## Imputed missing values in column 'Social_Websites_Hours' with median: 5
## Imputed missing values in column 'Texting_Messaging_Hours' with median: 4
## Imputed missing values in column 'Computer_Use_Hours' with median: 10
## Imputed missing values in column 'Watching_TV_Hours' with median: 4
## Imputed missing values in column 'Paid_Work_Hours' with median: 0
## Imputed missing values in column 'Work_At_Home_Hours' with median: 3
check_categorical_in_numerical(df_new_1, numerical_columns)
## Column 'ClassGrade':
## - Contains only numeric values.
## - Number of missing values (NA): 0
##
## Column 'Ageyears':
## - Contains only numeric values.
## - Number of missing values (NA): 0
##
## Column 'Height_cm':
## - Contains only numeric values.
## - Number of missing values (NA): 0
##
## Column 'Footlength_cm':
## - Contains only numeric values.
## - Number of missing values (NA): 0
##
## Column 'Armspan_cm':
## - Contains only numeric values.
## - Number of missing values (NA): 0
##
## Column 'Languages_spoken':
## - Contains only numeric values.
## - Number of missing values (NA): 0
##
## Column 'Travel_time_to_School':
## - Contains only numeric values.
## - Number of missing values (NA): 0
##
## Column 'Reaction_time':
## - Contains only numeric values.
## - Number of missing values (NA): 0
##
## Column 'Score_in_memory_game':
## - Contains only numeric values.
## - Number of missing values (NA): 0
##
## Column 'Importance_reducing_pollution':
## - Contains only numeric values.
## - Number of missing values (NA): 0
##
## Column 'Importance_recycling_rubbish':
## - Contains only numeric values.
## - Number of missing values (NA): 0
##
## Column 'Importance_conserving_water':
## - Contains only numeric values.
## - Number of missing values (NA): 0
##
## Column 'Importance_saving_energy':
## - Contains only numeric values.
## - Number of missing values (NA): 0
##
## Column 'Importance_owning_computer':
## - Contains only numeric values.
## - Number of missing values (NA): 0
##
## Column 'Importance_Internet_access':
## - Contains only numeric values.
## - Number of missing values (NA): 0
##
## Column 'Left_Footlength_cm':
## - Contains only numeric values.
## - Number of missing values (NA): 0
##
## Column 'Index_Fingerlength_mm':
## - Contains only numeric values.
## - Number of missing values (NA): 0
##
## Column 'Ring_Fingerlength_mm':
## - Contains only numeric values.
## - Number of missing values (NA): 0
##
## Column 'Sleep_Hours_Schoolnight':
## - Contains only numeric values.
## - Number of missing values (NA): 0
##
## Column 'Sleep_Hours_Non_Schoolnight':
## - Contains only numeric values.
## - Number of missing values (NA): 0
##
## Column 'Home_Occupants':
## - Contains only numeric values.
## - Number of missing values (NA): 0
##
## Column 'Text_Messages_Sent_Yesterday':
## - Contains only numeric values.
## - Number of missing values (NA): 0
##
## Column 'Text_Messages_Received_Yesterday':
## - Contains only numeric values.
## - Number of missing values (NA): 0
##
## Column 'Hanging_Out_With_Friends_Hours':
## - Contains only numeric values.
## - Number of missing values (NA): 0
##
## Column 'Talking_On_Phone_Hours':
## - Contains only numeric values.
## - Number of missing values (NA): 0
##
## Column 'Doing_Homework_Hours':
## - Contains only numeric values.
## - Number of missing values (NA): 0
##
## Column 'Doing_Things_With_Family_Hours':
## - Contains only numeric values.
## - Number of missing values (NA): 0
##
## Column 'Outdoor_Activities_Hours':
## - Contains only numeric values.
## - Number of missing values (NA): 0
##
## Column 'Video_Games_Hours':
## - Contains only numeric values.
## - Number of missing values (NA): 0
##
## Column 'Social_Websites_Hours':
## - Contains only numeric values.
## - Number of missing values (NA): 0
##
## Column 'Texting_Messaging_Hours':
## - Contains only numeric values.
## - Number of missing values (NA): 0
##
## Column 'Computer_Use_Hours':
## - Contains only numeric values.
## - Number of missing values (NA): 0
##
## Column 'Watching_TV_Hours':
## - Contains only numeric values.
## - Number of missing values (NA): 0
##
## Column 'Paid_Work_Hours':
## - Contains only numeric values.
## - Number of missing values (NA): 0
##
## Column 'Work_At_Home_Hours':
## - Contains only numeric values.
## - Number of missing values (NA): 0
gg_miss_var(df_new_1)
vis_miss(df_new_1)
handle_outliers <- function(df, numerical_columns, method = "remove", iqr_multiplier = 1.5, z_threshold = 3) {
# Function to handle outliers in numerical variables
# Arguments:
# df: Data frame containing the data
# numerical_columns: Vector of column names (strings) to process
# method: "remove" (default), "impute_mean", "impute_median", or "cap"
# iqr_multiplier: Multiplier for IQR method (default = 1.5)
# z_threshold: Threshold for Z-score method (default = 3)
# Loop through each numerical column
for (col in numerical_columns) {
# Extract the column data
data <- df[[col]]
# Detect outliers using the IQR method
Q1 <- quantile(data, 0.25, na.rm = TRUE)
Q3 <- quantile(data, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
lower_bound <- Q1 - iqr_multiplier * IQR
upper_bound <- Q3 + iqr_multiplier * IQR
# Detect outliers using the Z-score method
z_scores <- scale(data)
z_outliers <- abs(z_scores) > z_threshold
# Combine IQR and Z-score outliers
outliers <- data < lower_bound | data > upper_bound | z_outliers
# Handle outliers based on the specified method
if (method == "remove") {
# Remove rows with outliers
df <- df[!outliers, ]
cat("Outliers removed from column:", col, "\n")
} else if (method == "impute_mean") {
# Impute outliers with the mean
mean_value <- mean(data, na.rm = TRUE)
df[[col]][outliers] <- mean_value
cat("Outliers in column", col, "imputed with mean:", mean_value, "\n")
} else if (method == "impute_median") {
# Impute outliers with the median
median_value <- median(data, na.rm = TRUE)
df[[col]][outliers] <- median_value
cat("Outliers in column", col, "imputed with median:", median_value, "\n")
} else if (method == "cap") {
# Cap outliers at the lower and upper bounds
df[[col]][data < lower_bound] <- lower_bound
df[[col]][data > upper_bound] <- upper_bound
cat("Outliers in column", col, "capped at bounds:", lower_bound, "and", upper_bound, "\n")
} else {
stop("Invalid method. Choose 'remove', 'impute_mean', 'impute_median', or 'cap'.")
}
}
return(df)
}
df_capped <- handle_outliers(df_new_1, numerical_columns, method = "cap")
## Outliers in column ClassGrade capped at bounds: 7 and 15
## Outliers in column Ageyears capped at bounds: 12 and 20
## Outliers in column Height_cm capped at bounds: 127.575 and 203.775
## Outliers in column Footlength_cm capped at bounds: 17.75 and 31.75
## Outliers in column Armspan_cm capped at bounds: 117.5 and 209.5
## Outliers in column Languages_spoken capped at bounds: -0.5 and 3.5
## Outliers in column Travel_time_to_School capped at bounds: -12.5 and 39.5
## Outliers in column Reaction_time capped at bounds: 0.08175 and 0.76775
## Outliers in column Score_in_memory_game capped at bounds: 16.5 and 68.5
## Outliers in column Importance_reducing_pollution capped at bounds: -175 and 1625
## Outliers in column Importance_recycling_rubbish capped at bounds: -100 and 1500
## Outliers in column Importance_conserving_water capped at bounds: -350 and 1650
## Outliers in column Importance_saving_energy capped at bounds: -166.875 and 1540.125
## Outliers in column Importance_owning_computer capped at bounds: -850 and 1950
## Outliers in column Importance_Internet_access capped at bounds: -250 and 1750
## Outliers in column Left_Footlength_cm capped at bounds: 17.625 and 31.025
## Outliers in column Index_Fingerlength_mm capped at bounds: -22.5 and 141.5
## Outliers in column Ring_Fingerlength_mm capped at bounds: -32.5 and 147.5
## Outliers in column Sleep_Hours_Schoolnight capped at bounds: 3 and 11
## Outliers in column Sleep_Hours_Non_Schoolnight capped at bounds: 5 and 13
## Outliers in column Home_Occupants capped at bounds: 2.5 and 6.5
## Outliers in column Text_Messages_Sent_Yesterday capped at bounds: -109.75 and 206.25
## Outliers in column Text_Messages_Received_Yesterday capped at bounds: -120 and 232
## Outliers in column Hanging_Out_With_Friends_Hours capped at bounds: -12.875 and 32.125
## Outliers in column Talking_On_Phone_Hours capped at bounds: -3.5 and 8.5
## Outliers in column Doing_Homework_Hours capped at bounds: -10 and 22
## Outliers in column Doing_Things_With_Family_Hours capped at bounds: -7.5 and 20.5
## Outliers in column Outdoor_Activities_Hours capped at bounds: -10 and 22
## Outliers in column Video_Games_Hours capped at bounds: -9 and 15
## Outliers in column Social_Websites_Hours capped at bounds: -11.5 and 24.5
## Outliers in column Texting_Messaging_Hours capped at bounds: -13 and 27
## Outliers in column Computer_Use_Hours capped at bounds: -22.375 and 50.625
## Outliers in column Watching_TV_Hours capped at bounds: -7 and 17
## Outliers in column Paid_Work_Hours capped at bounds: -12 and 20
## Outliers in column Work_At_Home_Hours capped at bounds: -5 and 11
str(df_capped)
## 'data.frame': 452 obs. of 60 variables:
## $ Country : chr "USA" "USA" "USA" "USA" ...
## $ Region : chr "GA" "NE" "SC" "TX" ...
## $ DataYear : num 2022 2017 2020 2023 2014 ...
## $ ClassGrade : num 12 10 12 12 12 8 11 12 10 12 ...
## $ Gender : chr "Male" "Male" "Female" "Female" ...
## $ Ageyears : num 17 16 18 17 17 12 16 17 15 16 ...
## $ Handed : chr "Right-Handed" "Right-Handed" "Right-Handed" "Right-Handed" ...
## $ Height_cm : num 182 128 165 172 165 ...
## $ Footlength_cm : num 31.8 31.8 24 26 25 ...
## $ Armspan_cm : num 193 170 163 177 168 ...
## $ Languages_spoken : num 1 2 1 1 1 1 2 1 1.5 2 ...
## $ Travel_to_School : chr "Car" "Car" "Car" "Car" ...
## $ Travel_time_to_School : num 10 30 10 7 10 15 10 7 10 7 ...
## $ Reaction_time : num 0.324 0.34 0.768 0.328 0.501 ...
## $ Score_in_memory_game : num 37 50 39 35 40 61 32 33 30 30 ...
## $ Favourite_physical_activity : chr "Lacrosse" "Baseball/Softball" "Athletics" "Other" ...
## $ Importance_reducing_pollution : num 743 900 299 650 800 572 900 450 200 791 ...
## $ Importance_recycling_rubbish : num 700 900 1000 600 900 583 500 450 100 503 ...
## $ Importance_conserving_water : num 645 900 326 450 850 620 300 450 400 834 ...
## $ Importance_saving_energy : num 700 800 487 500 800 695 300 450 750 856 ...
## $ Importance_owning_computer : num 513 500 0 200 800 754 800 1000 900 1000 ...
## $ Importance_Internet_access : num 800 600 1000 500 800 856 800 1000 900 1000 ...
## $ Left_Footlength_cm : num 30 31 24 26 25 24.2 24 22 26 23 ...
## $ Longer_foot : chr "Right foot" "Right foot" "Same length" "Right foot" ...
## $ Index_Fingerlength_mm : num 100 40 70.5 7 75 67 77 80 82 70 ...
## $ Ring_Fingerlength_mm : num 105 50 70.8 7 75 ...
## $ Longer_Finger_Lefthand : chr "Ring finger" "Ring finger" "Ring finger" "Index finger" ...
## $ Birth_month : chr "January" "September" "September" "February" ...
## $ Favorite_Season : chr "Winter" "Summer" "Spring" "Spring" ...
## $ Allergies : chr "No" "Yes" "No" "Yes" ...
## $ Vegetarian : chr "No" "No" "No" "No" ...
## $ Favorite_Food : chr "Meat" "Meat" "Seafood" "Poultry" ...
## $ Beverage : chr "Water" "Water" "Water" "Water" ...
## $ Favorite_School_Subject : chr "Mathematics and statistics" "Physical education" "Mathematics and statistics" "Art" ...
## $ Sleep_Hours_Schoolnight : num 8 7 7 7 8 7 6 6.5 6 6 ...
## $ Sleep_Hours_Non_Schoolnight : num 6 9 9 9 11 9 8 9 11 6 ...
## $ Home_Occupants : num 3 5 4 6 6.5 4 4 6 2.5 4 ...
## $ Home_Internet_Access : chr "Yes - other" "Yes - dial-up connection" "Yes - other" "Yes - broadband connection" ...
## $ Communication_With_Friends : chr "Cell phone" "Myspace, Facebook, other social networking sites, or blog" "Text messaging" NA ...
## $ Text_Messages_Sent_Yesterday : num 206 20 4 15 7 ...
## $ Text_Messages_Received_Yesterday: num 232 30 2 15 15 0 232 5 25 11 ...
## $ Hanging_Out_With_Friends_Hours : num 10 32.1 9.5 10 2 ...
## $ Talking_On_Phone_Hours : num 2 2 1.5 1 1 2 8.5 0 1 2 ...
## $ Doing_Homework_Hours : num 12 2 6 0 9 20 10 6 4 10 ...
## $ Doing_Things_With_Family_Hours : num 2 20.5 5 1 7 20.5 0 10 20 20.5 ...
## $ Outdoor_Activities_Hours : num 18 14 5 0 11 22 0 0 20 12 ...
## $ Video_Games_Hours : num 3 2 1 0 0 15 15 14 15 15 ...
## $ Social_Websites_Hours : num 1 6 5 3 20 0 16 7 15 4 ...
## $ Texting_Messaging_Hours : num 4 1 4 20 1 1 24 2 1 3 ...
## $ Computer_Use_Hours : num 10 3 10 50 21 25 30 21 25 39 ...
## $ Watching_TV_Hours : num 4 17 4 5 11 7 5 14 5 4 ...
## $ Paid_Work_Hours : num 15 0 0 15 15 0 4 0 0 0 ...
## $ Work_At_Home_Hours : num 0.5 4 3 7 2 0 2 6 4 0 ...
## $ Schoolwork_Pressure : chr "Very little" "Some" "A lot" "Some" ...
## $ Planned_Education_Level : chr "Graduate degree" "Graduate degree" "Graduate degree" "Other" ...
## $ Favorite_Music : chr "Country" "Pop" "Rap/Hip hop" "Rap/Hip hop" ...
## $ Superpower : chr "Invisibility" "Super strength" "Telepathy" "Telepathy" ...
## $ Preferred_Status : chr "Happy" "Happy" "Happy" "Happy" ...
## $ Role_Model_Type : chr "Business person" "Relative" "Relative" "Friend" ...
## $ Charity_Donation : chr "Religious" "Health" "International aid" "International aid" ...
categorical_variables <- c(
"Country",
"Region",
"ClassGrade",
"Gender",
"Handed",
"Travel_to_School",
"Favourite_physical_activity",
"Longer_foot",
"Longer_Finger_Lefthand",
"Birth_month",
"Favorite_Season",
"Allergies",
"Vegetarian",
"Favorite_Food",
"Beverage",
"Favorite_School_Subject",
"Home_Internet_Access",
"Communication_With_Friends",
"Favorite_Music",
"Superpower",
"Preferred_Status",
"Role_Model_Type",
"Charity_Donation",
"Planned_Education_Level",
"Schoolwork_Pressure"
)
check_cat_missing_values <- function(df, categorical_variables) {
missing_summary <- sapply(df[, categorical_variables], function(col) sum(is.na(col)))
return(missing_summary)
}
# Check missing values
missing_values_cat <- check_cat_missing_values(df_capped, categorical_variables)
print(missing_values_cat)
## Country Region
## 0 0
## ClassGrade Gender
## 0 1
## Handed Travel_to_School
## 1 1
## Favourite_physical_activity Longer_foot
## 1 16
## Longer_Finger_Lefthand Birth_month
## 19 3
## Favorite_Season Allergies
## 2 5
## Vegetarian Favorite_Food
## 7 3
## Beverage Favorite_School_Subject
## 4 5
## Home_Internet_Access Communication_With_Friends
## 12 19
## Favorite_Music Superpower
## 39 35
## Preferred_Status Role_Model_Type
## 34 36
## Charity_Donation Planned_Education_Level
## 34 34
## Schoolwork_Pressure
## 31
calculate_mode <- function(col) {
mode <- names(sort(table(col), decreasing = TRUE))[1]
return(mode)
}
impute_missing_values_cat <- function(df, categorical_variables) {
for (col in categorical_variables) {
cat("Processing column:", col, "\n")
cat("Class of column:", class(df[[col]]), "\n")
if (sum(is.na(df[[col]])) > 0) { # Check if the column has missing values
mode_value <- calculate_mode(df[[col]])
df[[col]][is.na(df[[col]])] <- mode_value # Replace missing values with the mode
cat("Imputed missing values in column '", col, "' with mode: '", mode_value, "'\n", sep = "")
}
}
return(df)
}
df_imputed <- impute_missing_values_cat(df_capped, categorical_variables)
## Processing column: Country
## Class of column: character
## Processing column: Region
## Class of column: character
## Processing column: ClassGrade
## Class of column: numeric
## Processing column: Gender
## Class of column: character
## Imputed missing values in column 'Gender' with mode: 'Female'
## Processing column: Handed
## Class of column: character
## Imputed missing values in column 'Handed' with mode: 'Right-Handed'
## Processing column: Travel_to_School
## Class of column: character
## Imputed missing values in column 'Travel_to_School' with mode: 'Car'
## Processing column: Favourite_physical_activity
## Class of column: character
## Imputed missing values in column 'Favourite_physical_activity' with mode: 'Other'
## Processing column: Longer_foot
## Class of column: character
## Imputed missing values in column 'Longer_foot' with mode: 'Same length'
## Processing column: Longer_Finger_Lefthand
## Class of column: character
## Imputed missing values in column 'Longer_Finger_Lefthand' with mode: 'Ring finger'
## Processing column: Birth_month
## Class of column: character
## Imputed missing values in column 'Birth_month' with mode: 'September'
## Processing column: Favorite_Season
## Class of column: character
## Imputed missing values in column 'Favorite_Season' with mode: 'Summer'
## Processing column: Allergies
## Class of column: character
## Imputed missing values in column 'Allergies' with mode: 'No'
## Processing column: Vegetarian
## Class of column: character
## Imputed missing values in column 'Vegetarian' with mode: 'No'
## Processing column: Favorite_Food
## Class of column: character
## Imputed missing values in column 'Favorite_Food' with mode: 'Meat'
## Processing column: Beverage
## Class of column: character
## Imputed missing values in column 'Beverage' with mode: 'Water'
## Processing column: Favorite_School_Subject
## Class of column: character
## Imputed missing values in column 'Favorite_School_Subject' with mode: 'Mathematics and statistics'
## Processing column: Home_Internet_Access
## Class of column: character
## Imputed missing values in column 'Home_Internet_Access' with mode: 'Yes - broadband connection'
## Processing column: Communication_With_Friends
## Class of column: character
## Imputed missing values in column 'Communication_With_Friends' with mode: 'Text messaging'
## Processing column: Favorite_Music
## Class of column: character
## Imputed missing values in column 'Favorite_Music' with mode: 'Rap/Hip hop'
## Processing column: Superpower
## Class of column: character
## Imputed missing values in column 'Superpower' with mode: 'Telepathy'
## Processing column: Preferred_Status
## Class of column: character
## Imputed missing values in column 'Preferred_Status' with mode: 'Happy'
## Processing column: Role_Model_Type
## Class of column: character
## Imputed missing values in column 'Role_Model_Type' with mode: 'Relative'
## Processing column: Charity_Donation
## Class of column: character
## Imputed missing values in column 'Charity_Donation' with mode: 'Health'
## Processing column: Planned_Education_Level
## Class of column: character
## Imputed missing values in column 'Planned_Education_Level' with mode: 'Graduate degree'
## Processing column: Schoolwork_Pressure
## Class of column: character
## Imputed missing values in column 'Schoolwork_Pressure' with mode: 'Some'
missing_values_after_imputation <- impute_missing_values_cat(df_imputed, categorical_variables)
## Processing column: Country
## Class of column: character
## Processing column: Region
## Class of column: character
## Processing column: ClassGrade
## Class of column: numeric
## Processing column: Gender
## Class of column: character
## Processing column: Handed
## Class of column: character
## Processing column: Travel_to_School
## Class of column: character
## Processing column: Favourite_physical_activity
## Class of column: character
## Processing column: Longer_foot
## Class of column: character
## Processing column: Longer_Finger_Lefthand
## Class of column: character
## Processing column: Birth_month
## Class of column: character
## Processing column: Favorite_Season
## Class of column: character
## Processing column: Allergies
## Class of column: character
## Processing column: Vegetarian
## Class of column: character
## Processing column: Favorite_Food
## Class of column: character
## Processing column: Beverage
## Class of column: character
## Processing column: Favorite_School_Subject
## Class of column: character
## Processing column: Home_Internet_Access
## Class of column: character
## Processing column: Communication_With_Friends
## Class of column: character
## Processing column: Favorite_Music
## Class of column: character
## Processing column: Superpower
## Class of column: character
## Processing column: Preferred_Status
## Class of column: character
## Processing column: Role_Model_Type
## Class of column: character
## Processing column: Charity_Donation
## Class of column: character
## Processing column: Planned_Education_Level
## Class of column: character
## Processing column: Schoolwork_Pressure
## Class of column: character
gg_miss_var(df_imputed)
# Categorical variables in to factors
df_imputed$Region <- as.factor(df_imputed$Region)
df_imputed$Gender <- as.factor(df_imputed$Gender)
df_imputed$Handed <- as.factor(df_imputed$Handed)
df_imputed$Schoolwork_Pressure <- as.factor(df_imputed$Schoolwork_Pressure)
df_imputed$Travel_to_School <- as.factor(df_imputed$Travel_to_School)
df_imputed$Beverage <- as.factor(df_imputed$Beverage)
df_imputed$Home_Internet_Access <- as.factor(df_imputed$Home_Internet_Access )
df_imputed$Favorite_School_Subject <- as.factor(df_imputed$Favorite_School_Subject)
df_imputed$Schoolwork_Pressure <- as.factor(df_imputed$Schoolwork_Pressure)
# Categorical variables in to factors
interest_cat <- c(
"Region",
"Gender",
"Handed",
"Travel_to_School",
"Favourite_physical_activity",
"Beverage",
"Favorite_School_Subject",
"Home_Internet_Access",
"Schoolwork_Pressure")
interest_num <- c("Ageyears","Travel_time_to_School" ,"Reaction_time" ,"Score_in_memory_game" ,"Sleep_Hours_Schoolnight","Sleep_Hours_Non_Schoolnight","Home_Occupants","Text_Messages_Sent_Yesterday", "Text_Messages_Received_Yesterday","Hanging_Out_With_Friends_Hours" ,"Talking_On_Phone_Hours","Doing_Homework_Hours" ,"Doing_Things_With_Family_Hours","Outdoor_Activities_Hours","Video_Games_Hours","Social_Websites_Hours" , "Texting_Messaging_Hours" , "Computer_Use_Hours" , "Watching_TV_Hours","Paid_Work_Hours" ,"Work_At_Home_Hours")
# Subset numerical variables of interest
num_vars <- df_imputed[, c(interest_num)]
# Calculate correlation matrix
cor_matrix <- cor(num_vars, use = "complete.obs")
# Visualize correlation matrix
corrplot(cor_matrix,
method = "circle", # Use circles to represent correlation
tl.cex = 0.8, # Text size
tl.col = "darkorchid") # Change text color to red
df_imputed$Avg_Night_Sleep <- rowMeans(df_imputed[, c("Sleep_Hours_Schoolnight", "Sleep_Hours_Non_Schoolnight")])
df_imputed$Schoolwork_Pressure_Binary <- ifelse(df_imputed$Schoolwork_Pressure %in% c("None", "Very little"), "Low Pressure", "High Pressure")
df_imputed$Schoolwork_Pressure_Binary <- as.factor(df_imputed$Schoolwork_Pressure_Binary)
df_imputed$Importance_of_Technology_score <- rowMeans(df_imputed[, c( "Importance_Internet_access", "Importance_owning_computer" )])
df_imputed$Mean_daily_screen_time<- rowSums(df_imputed[, c("Video_Games_Hours", "Computer_Use_Hours",
"Social_Websites_Hours", "Watching_TV_Hours",
"Texting_Messaging_Hours")]) / 7
df_imputed$Mean_daily_Physical_and_Social_Activity <- rowSums(df_imputed[, c("Outdoor_Activities_Hours" ,"Hanging_Out_With_Friends_Hours" ,"Work_At_Home_Hours","Doing_Things_With_Family_Hours" )])/ 7
ggplot(df_imputed, aes(x = Region)) +
geom_bar(fill = "darkseagreen", color = "black") +
theme_minimal() +
labs(title = "Bar Plot of Region", x = "Region", y = "Count") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplot(df_imputed, aes(x = Gender)) +
geom_bar(fill = "darkseagreen", color = "black") +
theme_minimal() +
labs(title = "Bar Plot of Gender", x = "Gender", y = "Count") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplot(df_imputed, aes(x = Handed)) +
geom_bar(fill = "darkseagreen", color = "black") +
theme_minimal() +
labs(title = "Bar Plot of Handed", x = "Handed", y = "Count") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplot(df_imputed, aes(x = Travel_to_School)) +
geom_bar(fill = "darkseagreen", color = "black") +
theme_minimal() +
labs(title = "Bar Plot of Travel_to_School", x = "Travel_to_School", y = "Count") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplot(df_imputed, aes(x = Favourite_physical_activity)) +
geom_bar(fill = "darkseagreen", color = "black") +
theme_minimal() +
labs(title = "Bar Plot of Favourite_physical_activity", x = "Favourite_physical_activity", y = "Count") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplot(df_imputed, aes(x = Beverage)) +
geom_bar(fill = "darkseagreen", color = "black") +
theme_minimal() +
labs(title = "Bar Plot of Beverage", x = "Beverage", y = "Count") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplot(df_imputed, aes(x = Favorite_School_Subject)) +
geom_bar(fill = "darkseagreen", color = "black") +
theme_minimal() +
labs(title = "Bar Plot of Favorite_School_Subject", x = "Favorite_School_Subject", y = "Count") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplot(df_imputed, aes(x = Home_Internet_Access)) +
geom_bar(fill = "darkseagreen", color = "black") +
theme_minimal() +
labs(title = "Bar Plot of Home_Internet_Access", x = "Home_Internet_Access", y = "Count") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
df_imputed$Schoolwork_Pressure <- factor(df_imputed$Schoolwork_Pressure,
levels = c("None", "Very little", "Some", "A lot"))
ggplot(df_imputed, aes(x = Schoolwork_Pressure)) +
geom_bar(fill = "darkseagreen", color = "black") +
theme_minimal() +
labs(title = "Bar Plot of School Work Pressure", x = "Schoolwork_Pressure", y = "Count") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplot(df_imputed, aes(x = Schoolwork_Pressure_Binary)) +
geom_bar(fill = "darkseagreen", color = "black") +
theme_minimal() +
labs(title = "Bar Plot of School work_Pressure", x = "Schoolwork_Pressure", y = "Count") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Paid Work Hours
ggplot(df_imputed, aes(x =Paid_Work_Hours )) +
geom_density(fill = "blue", alpha = 0.4) +
theme_minimal() +
labs(title = "Density Plot of Paid_Work_Hours", x = "Paid_Work_Hours", y = "Density")
Reaction Time
ggplot(df_imputed, aes(x = Reaction_time)) +
geom_density(fill = "blue", alpha = 0.4) +
theme_minimal() +
labs(title = "Density Plot of Reaction Time", x = "Reaction Time", y = "Density")
Score in memory game
ggplot(df_imputed, aes(x = Score_in_memory_game)) +
geom_density(fill = "coral4", alpha = 0.4) +
theme_minimal() +
labs(title = "Density Plot of Score_in_memory_game", x = "Score_in_memory_game", y = "Density")
Score in Screen time
ggplot(df_imputed, aes(x = Mean_daily_screen_time)) +
geom_density(fill = "royalblue4", alpha = 0.4) +
theme_minimal() +
labs(title = "Density Plot of Mean_daily_screen_time", x = "Mean_daily_screen_time", y = "Density")
Avearge night sleep
ggplot(df_imputed, aes(x = Avg_Night_Sleep)) +
geom_density(fill = "limegreen", alpha = 0.4) +
theme_minimal() +
labs(title = "Density Plot of Avg_Night_Sleep", x = "Avg_Night_Sleep", y = "Density")
Density Plot of Physical_and_Social_Activity
ggplot(df_imputed, aes(x = Mean_daily_Physical_and_Social_Activity)) +
geom_density(fill = "darkkhaki", alpha = 0.4) +
theme_minimal() +
labs(title = "Density Plot of Mean_daily_Physical_and_Social_Activity", x = "Mean_daily_Physical_and_Social_Activity", y = "Density")
boxplot(Reaction_time ~ Schoolwork_Pressure_Binary, data = df_imputed,
main = "Boxplot of Reaction_time by Schoolwork_Pressure",
xlab = "Schoolwork_PressureColumn", ylab = "Reaction_time",
col = "maroon")
boxplot(Score_in_memory_game ~ Schoolwork_Pressure_Binary, data = df_imputed,
main = "Boxplot of Score_in_memory_game by Schoolwork_Pressure",
xlab = "Schoolwork_PressureColumn", ylab = "Score_in_memory_game",
col = "lightblue")
boxplot(Mean_daily_screen_time~ Schoolwork_Pressure_Binary, data = df_imputed,
main = "Boxplot of Mean_daily_screen_time by Schoolwork_Pressure",
xlab = "Schoolwork_PressureColumn", ylab = "Mean_daily_screen_time",
col = "limegreen")
boxplot(Mean_daily_Physical_and_Social_Activity~ Schoolwork_Pressure_Binary, data = df_imputed,
main = "Boxplot of Mean_daily_Physical_and_Social_Activity by Schoolwork_Pressure",
xlab = "Schoolwork_PressureColumn", ylab = "Mean_daily_Physical_and_Social_Activity",
col = "powderblue")
df_selected <- df_imputed %>%
select(Schoolwork_Pressure_Binary, Mean_daily_screen_time, Mean_daily_Physical_and_Social_Activity
,
Home_Occupants, Avg_Night_Sleep)
full_model <- glm(Schoolwork_Pressure_Binary ~ ., data = df_selected , family = binomial)
summary(full_model)
##
## Call:
## glm(formula = Schoolwork_Pressure_Binary ~ ., family = binomial,
## data = df_selected)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.578662 0.950302 -2.714 0.00666
## Mean_daily_screen_time -0.060654 0.033830 -1.793 0.07298
## Mean_daily_Physical_and_Social_Activity 0.148522 0.051388 2.890 0.00385
## Home_Occupants 0.141829 0.115378 1.229 0.21898
## Avg_Night_Sleep 0.003684 0.096529 0.038 0.96956
##
## (Intercept) **
## Mean_daily_screen_time .
## Mean_daily_Physical_and_Social_Activity **
## Home_Occupants
## Avg_Night_Sleep
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 409.45 on 451 degrees of freedom
## Residual deviance: 398.19 on 447 degrees of freedom
## AIC: 408.19
##
## Number of Fisher Scoring iterations: 4
stepwise_model <- step(full_model, direction = "both")
## Start: AIC=408.19
## Schoolwork_Pressure_Binary ~ Mean_daily_screen_time + Mean_daily_Physical_and_Social_Activity +
## Home_Occupants + Avg_Night_Sleep
##
## Df Deviance AIC
## - Avg_Night_Sleep 1 398.20 406.20
## - Home_Occupants 1 399.70 407.70
## <none> 398.19 408.19
## - Mean_daily_screen_time 1 401.62 409.62
## - Mean_daily_Physical_and_Social_Activity 1 406.41 414.41
##
## Step: AIC=406.2
## Schoolwork_Pressure_Binary ~ Mean_daily_screen_time + Mean_daily_Physical_and_Social_Activity +
## Home_Occupants
##
## Df Deviance AIC
## - Home_Occupants 1 399.70 405.70
## <none> 398.20 406.20
## - Mean_daily_screen_time 1 401.64 407.64
## + Avg_Night_Sleep 1 398.19 408.19
## - Mean_daily_Physical_and_Social_Activity 1 406.41 412.41
##
## Step: AIC=405.7
## Schoolwork_Pressure_Binary ~ Mean_daily_screen_time + Mean_daily_Physical_and_Social_Activity
##
## Df Deviance AIC
## <none> 399.70 405.70
## + Home_Occupants 1 398.20 406.20
## - Mean_daily_screen_time 1 403.13 407.13
## + Avg_Night_Sleep 1 399.70 407.70
## - Mean_daily_Physical_and_Social_Activity 1 408.67 412.67
summary(stepwise_model)
##
## Call:
## glm(formula = Schoolwork_Pressure_Binary ~ Mean_daily_screen_time +
## Mean_daily_Physical_and_Social_Activity, family = binomial,
## data = df_selected)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.95455 0.28353 -6.894 5.44e-12
## Mean_daily_screen_time -0.06049 0.03371 -1.794 0.07273
## Mean_daily_Physical_and_Social_Activity 0.15424 0.05108 3.020 0.00253
##
## (Intercept) ***
## Mean_daily_screen_time .
## Mean_daily_Physical_and_Social_Activity **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 409.45 on 451 degrees of freedom
## Residual deviance: 399.70 on 449 degrees of freedom
## AIC: 405.7
##
## Number of Fisher Scoring iterations: 4
write.csv(df_selected, file = "df_selected.csv", row.names = FALSE)
write.csv(df_imputed, file = "df_imputed.csv", row.names = FALSE)
# Create nodes data frame
nodes <- data.frame(
id = c("center", "screen_time", "activity", "home_occupants", "sleep"),
label = c("Perception on\nSchool Work Pressure",
"Mean Daily\nScreen Time",
"Mean Daily\nPhysical &\nSocial Activity",
"Home \nOccupants",
"Average \nNight Sleep"),
x = c(0, -1, 1, -1, 1),
y = c(0, 1, 1, -1, -1),
size = c(80, 80, 80, 80, 80)
)
# Create edges data frame (connections between nodes)
edges <- data.frame(
from = c("screen_time", "activity", "home_occupants", "sleep"),
to = rep("center", 4),
arrow.size = 0.5
)
# Create the graph
g <- graph_from_data_frame(edges, vertices = nodes, directed = TRUE)
# Plot the graph
plot(g,
layout = as.matrix(nodes[, c("x", "y")]),
vertex.label = nodes$label,
vertex.size = nodes$size,
vertex.color = c("darkseagreen3", rep("lightsalmon", 5)),
vertex.frame.color = "gray",
vertex.label.color = "black",
vertex.label.cex = 0.8,
edge.arrow.size = 0.3,
edge.arrow.width = 1,
edge.color = "red4",
main = "Factors that could influenced Perception of School Work Pressure")