1. Membaca Data
# Load the student-habits dataset from a CSV file in the working directory.
data <- read.csv(file = "college_students_habits_1M.csv")
# Show a compact overview of the data frame: dimensions, columns, and types.
str(object = data)
## 'data.frame': 1000000 obs. of 42 variables:
## $ study_hours : num 3.015 3.665 2.704 3.445 0.193 ...
## $ attendance : num 67 73.3 72.3 74.8 55.1 ...
## $ assignment_completion : num 51.6 69.7 92.8 85.2 64.5 ...
## $ midterm_score : num 57.2 57.6 44.6 52 32.8 ...
## $ final_score : num 61.7 62.1 46.5 55.4 32.4 ...
## $ project_score : num 65.4 65.7 53.6 60.6 42.6 ...
## $ backlogs : int 4 3 2 2 5 3 1 0 3 2 ...
## $ sleep_hours : num 5.99 6.95 6.7 6.5 6.55 ...
## $ stress : num 4.29 1.84 3.86 5.07 1 ...
## $ anxiety : num 58.1 41.9 56.6 65.2 30.7 ...
## $ depression : num 53.1 36.9 51.6 60.2 25.7 ...
## $ motivation : num 4.5 6.25 7.3 6.51 6.28 ...
## $ concentration : num 5.27 6.37 4.53 4.46 5.47 ...
## $ time_management : num 4.77 5.98 7.52 7.01 5.63 ...
## $ self_discipline : num 4.77 5.98 7.52 7.01 5.63 ...
## $ social_media_hours : num 4.03 2.14 2 2.09 4 ...
## $ gaming_hours : num 2.69 1.42 1.33 1.4 2.67 ...
## $ netflix_hours : num 2.69 1.42 1.33 1.4 2.67 ...
## $ screen_time : num 9.4 4.98 4.66 4.88 9.33 ...
## $ physical_activity : num 3.31 4.58 4.67 4.6 3.33 ...
## $ junk_food_frequency : num 3.69 2.42 2.33 2.4 3.67 ...
## $ caffeine_mg : num 213 151 169 183 180 ...
## $ late_night_frequency : num 2.69 1.42 1.33 1.4 2.67 ...
## $ procrastination_score : num 6.91 4.44 2.81 3.38 6.03 ...
## $ family_income : num 20108 47697 35156 23898 22091 ...
## $ parental_education_level: int 3 4 2 2 2 2 4 2 2 3 ...
## $ internet_quality : num 5.05 6.59 4.73 4.41 4.11 ...
## $ library_visits : num 1.814 1.837 0.971 1.469 0.188 ...
## $ online_courses_completed: int 0 0 1 1 0 0 2 4 0 1 ...
## $ part_time_hours : num 5.87 4.59 5.36 4.93 7.48 ...
## $ peer_study_group : int 0 0 0 0 0 0 1 1 0 0 ...
## $ relationship_status : int 1 0 0 0 1 0 0 0 1 0 ...
## $ hostel_student : int 0 0 1 1 1 1 0 1 1 0 ...
## $ extracurricular_hours : num 1.96 3.15 5.55 4.54 4.45 ...
## $ phone_unlocks_per_day : num 73.7 48.5 46.6 47.9 73.3 ...
## $ previous_gpa : num 5.72 5.76 4.46 5.2 3.28 ...
## $ class_participation : num 3.59 4.82 5.49 5.48 2.82 ...
## $ weekly_study_sessions : num 2.81 2.84 1.97 2.47 1.19 ...
## $ group_study_hours : num 1.814 1.837 0.971 1.469 0.188 ...
## $ financial_stress : num 5.49 2.88 5.7 6.6 4.6 ...
## $ gpa : num 0.547 0.707 0.868 0.729 0.371 ...
## $ performance_level : chr "Low" "Low" "Low" "Low" ...
2. Mengambil Variabel Numerik
# Keep only the numeric columns (is.numeric() is TRUE for integer and double).
# vapply() guarantees a logical mask with one entry per column, and
# drop = FALSE keeps the result a data frame even if only one column matches.
data_num <- data[, vapply(data, is.numeric, logical(1)), drop = FALSE]
3. Menghitung Parameter Statistik
# Per-column mean and standard deviation, ignoring missing values.
# vapply() iterates over the columns directly; apply() would first coerce the
# whole data frame to a matrix. The result is still a named numeric vector
# with one entry per column of data_num.
mean_values <- vapply(data_num, mean, numeric(1), na.rm = TRUE)
sd_values <- vapply(data_num, sd, numeric(1), na.rm = TRUE)
mean_values
## study_hours attendance assignment_completion
## 4.042036e+00 7.487637e+01 6.989670e+01
## midterm_score final_score project_score
## 6.001131e+01 6.485611e+01 6.797272e+01
## backlogs sleep_hours stress
## 2.550791e+00 6.499550e+00 3.211822e+00
## anxiety depression motivation
## 4.998615e+01 4.499145e+01 6.001923e+00
## concentration time_management self_discipline
## 6.002853e+00 6.001461e+00 6.001461e+00
## social_media_hours gaming_hours netflix_hours
## 3.014901e+00 2.009934e+00 2.009934e+00
## screen_time physical_activity junk_food_frequency
## 7.034769e+00 3.998554e+00 3.001828e+00
## caffeine_mg late_night_frequency procrastination_score
## 1.821618e+02 2.009934e+00 5.003990e+00
## family_income parental_education_level internet_quality
## 2.546035e+04 2.501398e+00 5.001144e+00
## library_visits online_courses_completed part_time_hours
## 2.010449e+00 9.633180e-01 4.999938e+00
## peer_study_group relationship_status hostel_student
## 5.007440e-01 3.092280e-01 4.998580e-01
## extracurricular_hours phone_unlocks_per_day previous_gpa
## 2.999902e+00 6.006881e+01 6.001131e+00
## class_participation weekly_study_sessions group_study_hours
## 5.008589e+00 3.002335e+00 2.511193e+00
## financial_stress gpa
## 4.998476e+00 8.311957e-01
# Display the per-column standard deviations.
print(sd_values)
## study_hours attendance assignment_completion
## 2.212739e+00 1.289286e+01 1.468279e+01
## midterm_score final_score project_score
## 1.496727e+01 1.760637e+01 1.387704e+01
## backlogs sleep_hours stress
## 1.650993e+00 4.356498e-01 1.756919e+00
## anxiety depression motivation
## 1.499979e+01 1.499120e+01 9.737576e-01
## concentration time_management self_discipline
## 1.096555e+00 9.992839e-01 9.992839e-01
## social_media_hours gaming_hours netflix_hours
## 1.469558e+00 9.797051e-01 9.797051e-01
## screen_time physical_activity junk_food_frequency
## 3.428968e+00 9.997763e-01 9.985228e-01
## caffeine_mg late_night_frequency procrastination_score
## 2.745490e+01 9.797051e-01 1.657632e+00
## family_income parental_education_level internet_quality
## 1.478761e+04 1.032773e+00 1.000158e+00
## library_visits online_courses_completed part_time_hours
## 9.813614e-01 1.207893e+00 1.611714e+00
## peer_study_group relationship_status hostel_student
## 4.999997e-01 4.621756e-01 5.000002e-01
## extracurricular_hours phone_unlocks_per_day previous_gpa
## 9.985150e-01 1.988418e+01 1.496727e+00
## class_participation weekly_study_sessions group_study_hours
## 1.713582e+00 1.000212e+00 1.415504e+00
## financial_stress gpa
## 1.340340e+00 2.952901e-01
4. Membangkitkan Data Baru
# Simulate 1000 new observations.
# Each selected variable is drawn independently from a normal distribution
# whose mean and standard deviation are taken from mean_values / sd_values.
# NOTE(review): independent normal draws are unbounded and ignore any
# correlation between variables, so out-of-range values (e.g. negative hours,
# attendance above 100) can appear in the generated data.
set.seed(123)
n <- 1000
data_baru <- local({
  sim_vars <- c(
    "study_hours", "attendance", "sleep_hours", "stress", "motivation",
    "social_media_hours", "physical_activity"
  )
  # Columns are drawn in the listed order, so the random stream is consumed
  # in a fixed sequence for the seed set above.
  draws <- lapply(
    sim_vars,
    function(v) rnorm(n, mean_values[[v]], sd_values[[v]])
  )
  names(draws) <- sim_vars
  as.data.frame(draws)
})
5. Melihat Data Hasil Bangkitan
# Preview the first six generated rows.
head(data_baru, n = 6L)
## study_hours attendance sleep_hours stress motivation social_media_hours
## 1 2.801850 62.03767 6.276670 2.9477439 6.193315 2.288684
## 2 3.532713 61.46837 6.602772 2.6359793 6.634975 4.671965
## 3 7.491050 74.64455 6.263607 0.6675131 6.655318 1.329392
## 4 4.198053 73.17225 7.030707 1.9867495 4.751464 5.191343
## 5 4.328116 42.00804 6.575413 7.7771585 4.028983 4.361297
## 6 7.837027 88.29234 6.231509 3.1460868 8.149376 3.507395
## physical_activity
## 1 3.299482
## 2 4.994782
## 3 3.305963
## 4 3.895094
## 5 4.602285
## 6 3.390645
# Per-column summary (min, quartiles, mean, max) of the generated data.
summary(object = data_baru)
## study_hours attendance sleep_hours stress
## Min. :-2.175 Min. : 35.58 Min. :5.259 Min. :-2.286
## 1st Qu.: 2.652 1st Qu.: 66.45 1st Qu.:6.214 1st Qu.: 2.088
## Median : 4.062 Median : 75.58 Median :6.478 Median : 3.197
## Mean : 4.078 Mean : 75.42 Mean :6.491 Mean : 3.196
## 3rd Qu.: 5.513 3rd Qu.: 84.59 3rd Qu.:6.779 3rd Qu.: 4.355
## Max. :11.214 Max. :118.59 Max. :7.990 Max. : 8.298
## motivation social_media_hours physical_activity
## Min. :2.947 Min. :-1.779 Min. :0.5855
## 1st Qu.:5.314 1st Qu.: 2.103 1st Qu.:3.3112
## Median :5.970 Median : 3.057 Median :3.9601
## Mean :5.971 Mean : 3.066 Mean :3.9657
## 3rd Qu.:6.621 3rd Qu.: 4.073 3rd Qu.:4.6840
## Max. :9.357 Max. : 8.475 Max. :7.2737
6. Menyimpan Data Bangkitan
# Write the generated data to a CSV file, without a row-name column.
write.csv(
  x = data_baru,
  file = "data_bangkitan_mahasiswa.csv",
  row.names = FALSE
)