library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the exam-score CSV into a base data.frame and print its structure
# (row/column counts, column names and storage types).
# NOTE(review): the path is absolute and machine-specific, so the script only
# runs where this exact file exists; consider a project-relative path.
data <- read.csv(
  file = "C:/Users/13074/Desktop/course/PA470/Assignment 1/Exam_Score_Prediction.csv"
)
utils::str(data)
## 'data.frame':    20000 obs. of  12 variables:
##  $ student_id      : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ age             : int  17 23 22 20 20 23 17 22 18 17 ...
##  $ course          : chr  "diploma" "bca" "b.sc" "diploma" ...
##  $ study_hours     : num  2.78 3.37 7.88 0.67 0.89 3.48 1.35 5.48 2.89 6.77 ...
##  $ class_attendance: num  92.9 64.8 76.8 48.4 71.6 65.4 69 51.1 92 44.8 ...
##  $ internet_access : chr  "yes" "yes" "yes" "yes" ...
##  $ sleep_hours     : num  7.4 4.6 8.5 5.8 9.8 4.2 7.4 8.2 6.6 9.8 ...
##  $ sleep_quality   : chr  "poor" "average" "poor" "average" ...
##  $ study_method    : chr  "coaching" "online videos" "coaching" "online videos" ...
##  $ facility_rating : chr  "low" "medium" "high" "low" ...
##  $ exam_difficulty : chr  "hard" "moderate" "moderate" "moderate" ...
##  $ exam_score      : num  58.9 54.8 90.3 29.7 43.7 58.2 53.7 47.3 44.9 77.7 ...
# Column-wise preview of the data: one line per column with its type and the
# first few values.
dplyr::glimpse(x = data)
## Rows: 20,000
## Columns: 12
## $ student_id       <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16…
## $ age              <int> 17, 23, 22, 20, 20, 23, 17, 22, 18, 17, 21, 24, 22, 2…
## $ course           <chr> "diploma", "bca", "b.sc", "diploma", "diploma", "b.te…
## $ study_hours      <dbl> 2.78, 3.37, 7.88, 0.67, 0.89, 3.48, 1.35, 5.48, 2.89,…
## $ class_attendance <dbl> 92.9, 64.8, 76.8, 48.4, 71.6, 65.4, 69.0, 51.1, 92.0,…
## $ internet_access  <chr> "yes", "yes", "yes", "yes", "yes", "yes", "yes", "yes…
## $ sleep_hours      <dbl> 7.4, 4.6, 8.5, 5.8, 9.8, 4.2, 7.4, 8.2, 6.6, 9.8, 5.8…
## $ sleep_quality    <chr> "poor", "average", "poor", "average", "poor", "good",…
## $ study_method     <chr> "coaching", "online videos", "coaching", "online vide…
## $ facility_rating  <chr> "low", "medium", "high", "low", "low", "low", "high",…
## $ exam_difficulty  <chr> "hard", "moderate", "moderate", "moderate", "moderate…
## $ exam_score       <dbl> 58.9, 54.8, 90.3, 29.7, 43.7, 58.2, 53.7, 47.3, 44.9,…
# Count missing values per column: is.na() yields a logical matrix, and
# colSums() adds up the TRUE entries in each column.
data %>%
  is.na() %>%
  colSums()
##       student_id              age           course      study_hours 
##                0                0                0                9 
## class_attendance  internet_access      sleep_hours    sleep_quality 
##               11                0                0                0 
##     study_method  facility_rating  exam_difficulty       exam_score 
##                0                0                0                0
# Per-column summary of the full data set: quantiles/mean (and NA counts) for
# numeric columns, length/class/mode for character columns.
# NOTE(review): student_id spans 1..20001 over 20000 rows, so the ids are not
# a gapless 1..n sequence -- worth checking for a skipped or duplicated id.
data %>%
  summary()
##    student_id         age           course           study_hours   
##  Min.   :    1   Min.   :17.00   Length:20000       Min.   :0.080  
##  1st Qu.: 5001   1st Qu.:18.00   Class :character   1st Qu.:2.000  
##  Median :10000   Median :20.00   Mode  :character   Median :4.040  
##  Mean   :10001   Mean   :20.47                      Mean   :4.008  
##  3rd Qu.:15000   3rd Qu.:22.00                      3rd Qu.:6.000  
##  Max.   :20001   Max.   :24.00                      Max.   :7.910  
##                                                     NA's   :9      
##  class_attendance internet_access     sleep_hours    sleep_quality     
##  Min.   :40.60    Length:20000       Min.   :4.100   Length:20000      
##  1st Qu.:55.10    Class :character   1st Qu.:5.500   Class :character  
##  Median :69.90    Mode  :character   Median :7.000   Mode  :character  
##  Mean   :70.02                       Mean   :7.009                     
##  3rd Qu.:85.00                       3rd Qu.:8.500                     
##  Max.   :99.40                       Max.   :9.900                     
##  NA's   :11                                                            
##  study_method       facility_rating    exam_difficulty      exam_score    
##  Length:20000       Length:20000       Length:20000       Min.   : 19.60  
##  Class :character   Class :character   Class :character   1st Qu.: 48.80  
##  Mode  :character   Mode  :character   Mode  :character   Median : 62.60  
##                                                           Mean   : 62.51  
##                                                           3rd Qu.: 76.30  
##                                                           Max.   :100.00  
## 
# Summarise the numeric variables only; character columns are dropped by
# where(is.numeric).
# NOTE(review): student_id is an integer identifier, so it is selected here
# even though its summary statistics carry no analytical meaning.
data %>%
  dplyr::select(where(is.numeric)) %>%
  summary()
##    student_id         age         study_hours    class_attendance
##  Min.   :    1   Min.   :17.00   Min.   :0.080   Min.   :40.60   
##  1st Qu.: 5001   1st Qu.:18.00   1st Qu.:2.000   1st Qu.:55.10   
##  Median :10000   Median :20.00   Median :4.040   Median :69.90   
##  Mean   :10001   Mean   :20.47   Mean   :4.008   Mean   :70.02   
##  3rd Qu.:15000   3rd Qu.:22.00   3rd Qu.:6.000   3rd Qu.:85.00   
##  Max.   :20001   Max.   :24.00   Max.   :7.910   Max.   :99.40   
##                                  NA's   :9       NA's   :11      
##   sleep_hours      exam_score    
##  Min.   :4.100   Min.   : 19.60  
##  1st Qu.:5.500   1st Qu.: 48.80  
##  Median :7.000   Median : 62.60  
##  Mean   :7.009   Mean   : 62.51  
##  3rd Qu.:8.500   3rd Qu.: 76.30  
##  Max.   :9.900   Max.   :100.00  
##